diff options
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r-- | Documentation/filesystems/00-INDEX | 2 | ||||
-rw-r--r-- | Documentation/filesystems/Locking | 6 | ||||
-rw-r--r-- | Documentation/filesystems/afs.txt | 34 | ||||
-rw-r--r-- | Documentation/filesystems/autofs4-mount-control.txt | 1 | ||||
-rw-r--r-- | Documentation/filesystems/autofs4.txt | 39 | ||||
-rw-r--r-- | Documentation/filesystems/ceph.txt | 5 | ||||
-rw-r--r-- | Documentation/filesystems/configfs/configfs.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/dax.txt | 22 | ||||
-rw-r--r-- | Documentation/filesystems/ext4.txt | 13 | ||||
-rw-r--r-- | Documentation/filesystems/f2fs.txt | 7 | ||||
-rw-r--r-- | Documentation/filesystems/locks.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/logfs.txt | 241 | ||||
-rw-r--r-- | Documentation/filesystems/nfs/nfsroot.txt | 4 | ||||
-rw-r--r-- | Documentation/filesystems/overlayfs.txt | 26 | ||||
-rw-r--r-- | Documentation/filesystems/porting | 10 | ||||
-rw-r--r-- | Documentation/filesystems/proc.txt | 18 | ||||
-rw-r--r-- | Documentation/filesystems/quota.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/sysfs-pci.txt | 2 | ||||
-rw-r--r-- | Documentation/filesystems/vfs.txt | 13 | ||||
-rw-r--r-- | Documentation/filesystems/xfs.txt | 12 |
20 files changed, 119 insertions, 342 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index f66e748fc5e4..b7bd6c9009cc 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -87,8 +87,6 @@ jfs.txt - info and mount options for the JFS filesystem. locks.txt - info on file locking implementations, flock() vs. fcntl(), etc. -logfs.txt - - info on the LogFS flash filesystem. mandatory-locking.txt - info on the Linux implementation of Sys V mandatory file locking. ncpfs.txt diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1b5f15653b1b..fe25787ff6d4 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -20,7 +20,7 @@ prototypes: void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)((struct dentry *dentry, char *buffer, int buflen); struct vfsmount *(*d_automount)(struct path *path); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); @@ -58,7 +58,7 @@ prototypes: int (*permission) (struct inode *, int, unsigned int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *); + int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); void (*update_time)(struct inode *, struct timespec *, int); @@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with the page is passed in -"pte" field in fault_env structure. Pointers to entries for other offsets +"pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is diff --git a/Documentation/filesystems/afs.txt b/Documentation/filesystems/afs.txt index ffef91c4e0d6..060da408923b 100644 --- a/Documentation/filesystems/afs.txt +++ b/Documentation/filesystems/afs.txt @@ -64,8 +64,7 @@ USAGE When inserting the driver modules the root cell must be specified along with a list of volume location server IP addresses: - modprobe af_rxrpc - modprobe rxkad + modprobe rxrpc modprobe kafs rootcell=cambridge.redhat.com:172.16.18.73:172.16.18.91 The first module is the AF_RXRPC network protocol driver. This provides the @@ -214,34 +213,3 @@ If a file is opened with a particular key and then the file descriptor is passed to a process that doesn't have that key (perhaps over an AF_UNIX socket), then the operations on the file will be made with key that was used to open the file. - - -======== -EXAMPLES -======== - -Here's what I use to test this. Some of the names and IP addresses are local -to my internal DNS. My "root.afs" partition has a mount point within it for -some public volumes volumes. - -insmod /tmp/rxrpc.o -insmod /tmp/rxkad.o -insmod /tmp/kafs.o rootcell=cambridge.redhat.com:172.16.18.91 - -mount -t afs \%root.afs. /afs -mount -t afs \%cambridge.redhat.com:root.cell. /afs/cambridge.redhat.com/ - -echo add grand.central.org 18.9.48.14:128.2.203.61:130.237.48.87 > /proc/fs/afs/cells -mount -t afs "#grand.central.org:root.cell." /afs/grand.central.org/ -mount -t afs "#grand.central.org:root.archive." /afs/grand.central.org/archive -mount -t afs "#grand.central.org:root.contrib." /afs/grand.central.org/contrib -mount -t afs "#grand.central.org:root.doc." /afs/grand.central.org/doc -mount -t afs "#grand.central.org:root.project." /afs/grand.central.org/project -mount -t afs "#grand.central.org:root.service." /afs/grand.central.org/service -mount -t afs "#grand.central.org:root.software." /afs/grand.central.org/software -mount -t afs "#grand.central.org:root.user." /afs/grand.central.org/user - -umount /afs -rmmod kafs -rmmod rxkad -rmmod rxrpc diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs4-mount-control.txt index 50a3e01a36f8..e5177cb31a04 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs4-mount-control.txt @@ -179,6 +179,7 @@ struct autofs_dev_ioctl { * including this struct */ __s32 ioctlfd; /* automount command fd */ + /* Command parameters */ union { struct args_protover protover; struct args_protosubver protosubver; diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs4.txt index 8fac3fe7b8c9..f10dd590f69f 100644 --- a/Documentation/filesystems/autofs4.txt +++ b/Documentation/filesystems/autofs4.txt @@ -65,7 +65,7 @@ directory is a mount trap only if the filesystem is mounted *direct* and the root is empty. Directories created in the root directory are mount traps only if the -filesystem is mounted *indirect* and they are empty. +filesystem is mounted *indirect* and they are empty. Directories further down the tree depend on the *maxproto* mount option and particularly whether it is less than five or not. @@ -352,7 +352,7 @@ Communicating with autofs: root directory ioctls ------------------------------------------------ The root directory of an autofs filesystem will respond to a number of -ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN +ioctls. The process issuing the ioctl must have the CAP_SYS_ADMIN capability, or must be the automount daemon. The available ioctl commands are: @@ -425,8 +425,20 @@ Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: * including this struct */ __s32 ioctlfd; /* automount command fd */ - __u32 arg1; /* Command parameters */ - __u32 arg2; + /* Command parameters */ + union { + struct args_protover protover; + struct args_protosubver protosubver; + struct args_openmount openmount; + struct args_ready ready; + struct args_fail fail; + struct args_setpipefd setpipefd; + struct args_timeout timeout; + struct args_requester requester; + struct args_expire expire; + struct args_askumount askumount; + struct args_ismountpoint ismountpoint; + }; char path[0]; }; @@ -446,25 +458,22 @@ Commands are: set version numbers. - **AUTOFS_DEV_IOCTL_OPENMOUNT_CMD**: return an open file descriptor on the root of an autofs filesystem. The filesystem is identified - by name and device number, which is stored in `arg1`. Device - numbers for existing filesystems can be found in + by name and device number, which is stored in `openmount.devid`. + Device numbers for existing filesystems can be found in `/proc/self/mountinfo`. - **AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD**: same as `close(ioctlfd)`. - **AUTOFS_DEV_IOCTL_SETPIPEFD_CMD**: if the filesystem is in catatonic mode, this can provide the write end of a new pipe - in `arg1` to re-establish communication with a daemon. The - process group of the calling process is used to identify the + in `setpipefd.pipefd` to re-establish communication with a daemon. + The process group of the calling process is used to identify the daemon. - **AUTOFS_DEV_IOCTL_REQUESTER_CMD**: `path` should be a name within the filesystem that has been auto-mounted on. - arg1 is the dev number of the underlying autofs. On successful - return, `arg1` and `arg2` will be the UID and GID of the process - which triggered that mount. - + On successful return, `requester.uid` and `requester.gid` will be + the UID and GID of the process which triggered that mount. - **AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD**: Check if path is a mountpoint of a particular type - see separate documentation for details. - - **AUTOFS_DEV_IOCTL_PROTOVER_CMD**: - **AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD**: - **AUTOFS_DEV_IOCTL_READY_CMD**: @@ -474,7 +483,7 @@ Commands are: - **AUTOFS_DEV_IOCTL_EXPIRE_CMD**: - **AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD**: These all have the same function as the similarly named **AUTOFS_IOC** ioctls, except - that **FAIL** can be given an explicit error number in `arg1` + that **FAIL** can be given an explicit error number in `fail.status` instead of assuming `ENOENT`, and this **EXPIRE** command corresponds to **AUTOFS_IOC_EXPIRE_MULTI**. @@ -512,7 +521,7 @@ always be mounted "shared". e.g. > `mount --make-shared /autofs/mount/point` -The automount daemon is only able to mange a single mount location for +The automount daemon is only able to manage a single mount location for an autofs filesystem and if mounts on that are not 'shared', other locations will not behave as expected. In particular access to those other locations will likely result in the `ELOOP` error diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt index f5306ee40ea9..0b302a11718a 100644 --- a/Documentation/filesystems/ceph.txt +++ b/Documentation/filesystems/ceph.txt @@ -98,11 +98,10 @@ Mount Options size. rsize=X - Specify the maximum read size in bytes. By default there is no - maximum. + Specify the maximum read size in bytes. Default: 64 MB. rasize=X - Specify the maximum readahead. + Specify the maximum readahead. Default: 8 MB. mount_timeout=X Specify the timeout value for mount (in seconds), in the case diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt index 8ec9136aae56..3828e85345ae 100644 --- a/Documentation/filesystems/configfs/configfs.txt +++ b/Documentation/filesystems/configfs/configfs.txt @@ -174,7 +174,7 @@ among other things. For that, it needs a type. void (*release)(struct config_item *); int (*allow_link)(struct config_item *src, struct config_item *target); - int (*drop_link)(struct config_item *src, + void (*drop_link)(struct config_item *src, struct config_item *target); }; diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt index 23d18b8a49d5..a7e6e14aeb08 100644 --- a/Documentation/filesystems/dax.txt +++ b/Documentation/filesystems/dax.txt @@ -58,22 +58,22 @@ Implementation Tips for Filesystem Writers Filesystem support consists of - adding support to mark inodes as being DAX by setting the S_DAX flag in i_flags -- implementing the direct_IO address space operation, and calling - dax_do_io() instead of blockdev_direct_IO() if S_DAX is set +- implementing ->read_iter and ->write_iter operations which use dax_iomap_rw() + when inode has S_DAX flag set - implementing an mmap file operation for DAX files which sets the VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to - include handlers for fault, pmd_fault and page_mkwrite (which should - probably call dax_fault(), dax_pmd_fault() and dax_mkwrite(), passing the - appropriate get_block() callback) -- calling dax_truncate_page() instead of block_truncate_page() for DAX files -- calling dax_zero_page_range() instead of zero_user() for DAX files + include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These + handlers should probably call dax_iomap_fault() (for fault and page_mkwrite + handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate + iomap operations. +- calling iomap_zero_range() passing appropriate iomap operations instead of + block_truncate_page() for DAX files - ensuring that there is sufficient locking between reads, writes, truncates and page faults -The get_block() callback passed to the DAX functions may return -uninitialised extents. If it does, it must ensure that simultaneous -calls to get_block() (for example by a page-fault racing with a read() -or a write()) work correctly. +The iomap handlers for allocating blocks must make sure that allocated blocks +are zeroed out and converted to written extents before being returned to avoid +exposure of uninitialized data through mmap. These filesystems may be used for inspiration: - ext2: see Documentation/filesystems/ext2.txt diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 6c0108eb0137..3698ed3146e3 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -351,14 +351,13 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for interoperability with older kernels which only store and expect 16-bit values. -block_validity This options allows to enables/disables the in-kernel +block_validity(*) These options enable or disable the in-kernel noblock_validity facility for tracking filesystem metadata blocks - within internal data structures. This allows multi- - block allocator and other routines to quickly locate - extents which might overlap with filesystem metadata - blocks. This option is intended for debugging - purposes and since it negatively affects the - performance, it is off by default. + within internal data structures. This allows multi- + block allocator and other routines to notice + bugs or corrupted allocation bitmaps which cause + blocks to be allocated which overlap with + filesystem metadata blocks. dioread_lock Controls whether or not ext4 should use the DIO read dioread_nolock locking. If the dioread_nolock option is specified diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..4f6531a4701b 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,13 +125,14 @@ active_logs=%u Support configuring the number of active logs. In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. -noinline_dentry Diable the inline dentry feature. +noinline_dentry Disable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, @@ -157,6 +158,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". ================================================================================ DEBUGFS ENTRIES @@ -174,7 +177,7 @@ f2fs. Each file shows the whole f2fs information. SYSFS ENTRIES ================================================================================ -Information about mounted f2f2 file systems can be found in +Information about mounted f2fs file systems can be found in /sys/fs/f2fs. Each mounted filesystem will have a directory in /sys/fs/f2fs based on its device name (i.e., /sys/fs/f2fs/sda). The files in each per-device directory are shown in table below. diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.txt index 2cf81082581d..5368690f412e 100644 --- a/Documentation/filesystems/locks.txt +++ b/Documentation/filesystems/locks.txt @@ -19,7 +19,7 @@ forever. This should not cause problems for anybody, since everybody using a 2.1.x kernel should have updated their C library to a suitable version -anyway (see the file "Documentation/Changes".) +anyway (see the file "Documentation/process/changes.rst".) 1.2 Allow Mixed Locks Again --------------------------- diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt deleted file mode 100644 index bca42c22a143..000000000000 --- a/Documentation/filesystems/logfs.txt +++ /dev/null @@ -1,241 +0,0 @@ - -The LogFS Flash Filesystem -========================== - -Specification -============= - -Superblocks ------------ - -Two superblocks exist at the beginning and end of the filesystem. -Each superblock is 256 Bytes large, with another 3840 Bytes reserved -for future purposes, making a total of 4096 Bytes. - -Superblock locations may differ for MTD and block devices. On MTD the -first non-bad block contains a superblock in the first 4096 Bytes and -the last non-bad block contains a superblock in the last 4096 Bytes. -On block devices, the first 4096 Bytes of the device contain the first -superblock and the last aligned 4096 Byte-block contains the second -superblock. - -For the most part, the superblocks can be considered read-only. They -are written only to correct errors detected within the superblocks, -move the journal and change the filesystem parameters through tunefs. -As a result, the superblock does not contain any fields that require -constant updates, like the amount of free space, etc. - -Segments --------- - -The space in the device is split up into equal-sized segments. -Segments are the primary write unit of LogFS. Within each segments, -writes happen from front (low addresses) to back (high addresses. If -only a partial segment has been written, the segment number, the -current position within and optionally a write buffer are stored in -the journal. - -Segments are erased as a whole. Therefore Garbage Collection may be -required to completely free a segment before doing so. - -Journal --------- - -The journal contains all global information about the filesystem that -is subject to frequent change. At mount time, it has to be scanned -for the most recent commit entry, which contains a list of pointers to -all currently valid entries. - -Object Store ------------- - -All space except for the superblocks and journal is part of the object -store. Each segment contains a segment header and a number of -objects, each consisting of the object header and the payload. -Objects are either inodes, directory entries (dentries), file data -blocks or indirect blocks. - -Levels ------- - -Garbage collection (GC) may fail if all data is written -indiscriminately. One requirement of GC is that data is separated -roughly according to the distance between the tree root and the data. -Effectively that means all file data is on level 0, indirect blocks -are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks, -respectively. Inode file data is on level 6 for the inodes and 7-11 -for indirect blocks. - -Each segment contains objects of a single level only. As a result, -each level requires its own separate segment to be open for writing. - -Inode File ----------- - -All inodes are stored in a special file, the inode file. Single -exception is the inode file's inode (master inode) which for obvious -reasons is stored in the journal instead. Instead of data blocks, the -leaf nodes of the inode files are inodes. - -Aliases -------- - -Writes in LogFS are done by means of a wandering tree. A naïve -implementation would require that for each write or a block, all -parent blocks are written as well, since the block pointers have -changed. Such an implementation would not be very efficient. - -In LogFS, the block pointer changes are cached in the journal by means -of alias entries. Each alias consists of its logical address - inode -number, block index, level and child number (index into block) - and -the changed data. Any 8-byte word can be changes in this manner. - -Currently aliases are used for block pointers, file size, file used -bytes and the height of an inodes indirect tree. - -Segment Aliases ---------------- - -Related to regular aliases, these are used to handle bad blocks. -Initially, bad blocks are handled by moving the affected segment -content to a spare segment and noting this move in the journal with a -segment alias, a simple (to, from) tupel. GC will later empty this -segment and the alias can be removed again. This is used on MTD only. - -Vim ---- - -By cleverly predicting the life time of data, it is possible to -separate long-living data from short-living data and thereby reduce -the GC overhead later. Each type of distinc life expectency (vim) can -have a separate segment open for writing. Each (level, vim) tupel can -be open just once. If an open segment with unknown vim is encountered -at mount time, it is closed and ignored henceforth. - -Indirect Tree -------------- - -Inodes in LogFS are similar to FFS-style filesystems with direct and -indirect block pointers. One difference is that LogFS uses a single -indirect pointer that can be either a 1x, 2x, etc. indirect pointer. -A height field in the inode defines the height of the indirect tree -and thereby the indirection of the pointer. - -Another difference is the addressing of indirect blocks. In LogFS, -the first 16 pointers in the first indirect block are left empty, -corresponding to the 16 direct pointers in the inode. In ext2 (maybe -others as well) the first pointer in the first indirect block -corresponds to logical block 12, skipping the 12 direct pointers. -So where ext2 is using arithmetic to better utilize space, LogFS keeps -arithmetic simple and uses compression to save space. - -Compression ------------ - -Both file data and metadata can be compressed. Compression for file -data can be enabled with chattr +c and disabled with chattr -c. Doing -so has no effect on existing data, but new data will be stored -accordingly. New inodes will inherit the compression flag of the -parent directory. - -Metadata is always compressed. However, the space accounting ignores -this and charges for the uncompressed size. Failing to do so could -result in GC failures when, after moving some data, indirect blocks -compress worse than previously. Even on a 100% full medium, GC may -not consume any extra space, so the compression gains are lost space -to the user. - -However, they are not lost space to the filesystem internals. By -cheating the user for those bytes, the filesystem gained some slack -space and GC will run less often and faster. - -Garbage Collection and Wear Leveling ------------------------------------- - -Garbage collection is invoked whenever the number of free segments -falls below a threshold. The best (known) candidate is picked based -on the least amount of valid data contained in the segment. All -remaining valid data is copied elsewhere, thereby invalidating it. - -The GC code also checks for aliases and writes then back if their -number gets too large. - -Wear leveling is done by occasionally picking a suboptimal segment for -garbage collection. If a stale segments erase count is significantly -lower than the active segments' erase counts, it will be picked. Wear -leveling is rate limited, so it will never monopolize the device for -more than one segment worth at a time. - -Values for "occasionally", "significantly lower" are compile time -constants. - -Hashed directories ------------------- - -To satisfy efficient lookup(), directory entries are hashed and -located based on the hash. In order to both support large directories -and not be overly inefficient for small directories, several hash -tables of increasing size are used. For each table, the hash value -modulo the table size gives the table index. - -Tables sizes are chosen to limit the number of indirect blocks with a -fully populated table to 0, 1, 2 or 3 respectively. So the first -table contains 16 entries, the second 512-16, etc. - -The last table is special in several ways. First its size depends on -the effective 32bit limit on telldir/seekdir cookies. Since logfs -uses the upper half of the address space for indirect blocks, the size -is limited to 2^31. Secondly the table contains hash buckets with 16 -entries each. - -Using single-entry buckets would result in birthday "attacks". At -just 2^16 used entries, hash collisions would be likely (P >= 0.5). -My math skills are insufficient to do the combinatorics for the 17x -collisions necessary to overflow a bucket, but testing showed that in -10,000 runs the lowest directory fill before a bucket overflow was -188,057,130 entries with an average of 315,149,915 entries. So for -directory sizes of up to a million, bucket overflows should be -virtually impossible under normal circumstances. - -With carefully chosen filenames, it is obviously possible to cause an -overflow with just 21 entries (4 higher tables + 16 entries + 1). So -there may be a security concern if a malicious user has write access -to a directory. - -Open For Discussion -=================== - -Device Address Space --------------------- - -A device address space is used for caching. Both block devices and -MTD provide functions to either read a single page or write a segment. -Partial segments may be written for data integrity, but where possible -complete segments are written for performance on simple block device -flash media. - -Meta Inodes ------------ - -Inodes are stored in the inode file, which is just a regular file for -most purposes. At umount time, however, the inode file needs to -remain open until all dirty inodes are written. So -generic_shutdown_super() may not close this inode, but shouldn't -complain about remaining inodes due to the inode file either. Same -goes for mapping inode of the device address space. - -Currently logfs uses a hack that essentially copies part of fs/inode.c -code over. A general solution would be preferred. - -Indirect block mapping ----------------------- - -With compression, the block device (or mapping inode) cannot be used -to cache indirect blocks. Some other place is required. Currently -logfs uses the top half of each inode's address space. The low 8TB -(on 32bit) are filled with file data, the high 8TB are used for -indirect blocks. - -One problem is that 16TB files created on 64bit systems actually have -data in the top 8TB. But files >16TB would cause problems anyway, so -only the limit has changed. diff --git a/Documentation/filesystems/nfs/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt index 0b2883b17d4c..5efae00f6c7f 100644 --- a/Documentation/filesystems/nfs/nfsroot.txt +++ b/Documentation/filesystems/nfs/nfsroot.txt @@ -11,7 +11,7 @@ Updated 2006 by Horms <horms@verge.net.au> In order to use a diskless system, such as an X-terminal or printer server for example, it is necessary for the root filesystem to be present on a non-disk device. This may be an initramfs (see Documentation/filesystems/ -ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/initrd.txt) or a +ramfs-rootfs-initramfs.txt), a ramdisk (see Documentation/admin-guide/initrd.rst) or a filesystem mounted via NFS. The following text describes on how to use NFS for the root filesystem. For the rest of this text 'client' means the diskless system, and 'server' means the NFS server. @@ -284,7 +284,7 @@ They depend on various facilities being available: "kernel <relative-path-below /tftpboot>". The nfsroot parameters are passed to the kernel by adding them to the "append" line. It is common to use serial console in conjunction with pxeliunx, - see Documentation/serial-console.txt for more information. + see Documentation/admin-guide/serial-console.rst for more information. For more information on isolinux, including how to create bootdisks for prebuilt kernels, see http://syslinux.zytor.com/ diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index bcbf9710e4af..634d03e20c2d 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -66,7 +66,7 @@ At mount time, the two directories given as mount options "lowerdir" and "upperdir" are combined into a merged directory: mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\ -workdir=/work /merged + workdir=/work /merged The "workdir" needs to be an empty directory on the same filesystem as upperdir. @@ -118,6 +118,7 @@ programs. seek offsets are assigned sequentially when the directories are read. Thus if + - read part of a directory - remember an offset, and close the directory - re-open the directory some time later @@ -130,6 +131,23 @@ directory. Readdir on directories that are not merged is simply handled by the underlying directory (upper or lower). +renaming directories +-------------------- + +When renaming a directory that is on the lower layer or merged (i.e. the +directory was not created on the upper layer to start with) overlayfs can +handle it in two different ways: + +1. return EXDEV error: this error is returned by rename(2) when trying to + move a file or directory across filesystem boundaries. Hence + applications are usually prepared to hande this error (mv(1) for example + recursively copies the directory tree). This is the default behavior. + +2. If the "redirect_dir" feature is enabled, then the directory will be + copied up (but not the contents). Then the "trusted.overlay.redirect" + extended attribute is set to the path of the original location from the + root of the overlay. Finally the directory is moved to the new + location. Non-directories --------------- @@ -185,13 +203,13 @@ filesystem, so both st_dev and st_ino of the file may change. Any open files referring to this inode will access the old data. -Any file locks (and leases) obtained before copy_up will not apply -to the copied up file. - If a file with multiple hard links is copied up, then this will "break" the link. Changes will not be propagated to other names referring to the same inode. +Unless "redirect_dir" feature is enabled, rename(2) on a lower or merged +directory will fail with EXDEV. + Changes to underlying filesystems --------------------------------- diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index bdd025ceb763..5fb17f49f7a2 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -596,3 +596,13 @@ in your dentry operations instead. [mandatory] ->rename() has an added flags argument. Any flags not handled by the filesystem should result in EINVAL being returned. +-- +[recommended] + ->readlink is optional for symlinks. Don't set, unless filesystem needs + to fake something for readlink(2). +-- +[mandatory] + ->getattr() is now passed a struct path rather than a vfsmount and + dentry separately, and it now has request_mask and query_flags arguments + to specify the fields and sync type requested by statx. Filesystems not + supporting any statx-specific features may ignore the new arguments. diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 74329fd0add2..c94b4675d021 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -191,6 +191,7 @@ read the file /proc/PID/status: CapPrm: 0000000000000000 CapEff: 0000000000000000 CapBnd: ffffffffffffffff + NoNewPrivs: 0 Seccomp: 0 voluntary_ctxt_switches: 0 nonvoluntary_ctxt_switches: 1 @@ -211,10 +212,11 @@ asynchronous manner and the value may not be very precise. To see a precise snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table. It's slow but very precise. -Table 1-2: Contents of the status files (as of 4.1) +Table 1-2: Contents of the status files (as of 4.8) .............................................................................. Field Content Name filename of the executable + Umask file mode creation mask State state (R is running, S is sleeping, D is sleeping in an uninterruptible wait, Z is zombie, T is traced or stopped) @@ -225,7 +227,6 @@ Table 1-2: Contents of the status files (as of 4.1) TracerPid PID of process tracing this process (0 if not) Uid Real, effective, saved set, and file system UIDs Gid Real, effective, saved set, and file system GIDs - Umask file mode creation mask FDSize number of file descriptor slots currently allocated Groups supplementary group list NStgid descendant namespace thread group ID hierarchy @@ -235,6 +236,7 @@ Table 1-2: Contents of the status files (as of 4.1) VmPeak peak virtual memory size VmSize total program size VmLck locked memory size + VmPin pinned memory size VmHWM peak resident set size ("high water mark") VmRSS size of memory portions. It contains the three following parts (VmRSS = RssAnon + RssFile + RssShmem) @@ -262,6 +264,7 @@ Table 1-2: Contents of the status files (as of 4.1) CapPrm bitmap of permitted capabilities CapEff bitmap of effective capabilities CapBnd bitmap of capabilities bounding set + NoNewPrivs no_new_privs, like prctl(PR_GET_NO_NEW_PRIV, ...) Seccomp seccomp mode, like prctl(PR_GET_SECCOMP, ...) Cpus_allowed mask of CPUs on which this process may run Cpus_allowed_list Same as previous, but in "list format" @@ -1305,7 +1308,16 @@ second). The meanings of the columns are as follows, from left to right: - nice: niced processes executing in user mode - system: processes executing in kernel mode - idle: twiddling thumbs -- iowait: waiting for I/O to complete +- iowait: In a word, iowait stands for waiting for I/O to complete. But there + are several problems: + 1. Cpu will not wait for I/O to complete, iowait is the time that a task is + waiting for I/O to complete. When cpu goes into idle state for + outstanding task io, another task will be scheduled on this CPU. + 2. In a multi-core CPU, the task waiting for I/O to complete is not running + on any CPU, so the iowait of each CPU is difficult to calculate. + 3. The value of iowait field in /proc/stat will decrease in certain + conditions. + So, the iowait is not reliable by reading from /proc/stat. - irq: servicing interrupts - softirq: servicing softirqs - steal: involuntary wait diff --git a/Documentation/filesystems/quota.txt b/Documentation/filesystems/quota.txt index 29fc01552646..32874b06ebe9 100644 --- a/Documentation/filesystems/quota.txt +++ b/Documentation/filesystems/quota.txt @@ -6,7 +6,7 @@ Quota subsystem allows system administrator to set limits on used space and number of used inodes (inode is a filesystem structure which is associated with each file or directory) for users and/or groups. For both used space and number of used inodes there are actually two limits. The first one is called softlimit -and the second one hardlimit. An user can never exceed a hardlimit for any +and the second one hardlimit. A user can never exceed a hardlimit for any resource (unless he has CAP_SYS_RESOURCE capability). User is allowed to exceed softlimit but only for limited period of time. This period is called "grace period" or "grace time". When grace time is over, user is not able to allocate diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 74eaac26f8b8..6ea1ceda6f52 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -17,6 +17,7 @@ that support it. For example, a given bus might look like this: | |-- resource0 | |-- resource1 | |-- resource2 + | |-- revision | |-- rom | |-- subsystem_device | |-- subsystem_vendor @@ -41,6 +42,7 @@ files, each with their own function. resource PCI resource host addresses (ascii, ro) resource0..N PCI resource N, if present (binary, mmap, rw[1]) resource0_wc..N_wc PCI WC map resource N, if prefetchable (binary, mmap) + revision PCI revision (ascii, ro) rom PCI ROM resource, if present (binary, ro) subsystem_device PCI subsystem device (ascii, ro) subsystem_vendor PCI subsystem vendor (ascii, ro) diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index b5039a00caaf..94dd27ef4a76 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -382,7 +382,7 @@ struct inode_operations { int (*permission) (struct inode *, int); int (*get_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); - int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); + int (*getattr) (const struct path *, struct kstat *, u32, unsigned int); ssize_t (*listxattr) (struct dentry *, char *, size_t); void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, @@ -451,9 +451,6 @@ otherwise noted. exist; this is checked by the VFS. Unlike plain rename, source and target may be of different type. - readlink: called by the readlink(2) system call. Only required if - you want to support reading symbolic links - get_link: called by the VFS to follow a symbolic link to the inode it points to. Only required if you want to support symbolic links. This method returns the symlink body @@ -468,6 +465,12 @@ otherwise noted. argument. If request can't be handled without leaving RCU mode, have it return ERR_PTR(-ECHILD). + readlink: this is now just an override for use by readlink(2) for the + cases when ->get_link uses nd_jump_link() or object is not in + fact a symlink. Normally filesystems should only implement + ->get_link for symlinks and readlink(2) will automatically use + that. + permission: called by the VFS to check for access rights on a POSIX-like filesystem. @@ -948,7 +951,7 @@ struct dentry_operations { void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); - int (*d_manage)(struct dentry *, bool); + int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *, unsigned int); }; diff --git a/Documentation/filesystems/xfs.txt b/Documentation/filesystems/xfs.txt index c2d44e6e117b..3b9b5c149f32 100644 --- a/Documentation/filesystems/xfs.txt +++ b/Documentation/filesystems/xfs.txt @@ -51,13 +51,6 @@ default behaviour. CRC enabled filesystems always use the attr2 format, and so will reject the noattr2 mount option if it is set. - barrier (*) - nobarrier - Enables/disables the use of block layer write barriers for - writes into the journal and for data integrity operations. - This allows for drive level write caching to be enabled, for - devices that support write barriers. - discard nodiscard (*) Enable/disable the issuing of commands to let the block @@ -228,7 +221,10 @@ default behaviour. Deprecated Mount Options ======================== -None at present. + Name Removal Schedule + ---- ---------------- + barrier no earlier than v4.15 + nobarrier no earlier than v4.15 Removed Mount Options |