diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-07-18 17:44:06 +1000 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2014-07-18 17:44:06 +1000 |
commit | 706bf313be879cb232ccb0f7888f4cea85c798bf (patch) | |
tree | 16a36f6932f9b8e7c7e99aa19281d7b185be58d4 | |
parent | 4cba168ca6518e61ea29757e390a9eee4759075d (diff) | |
parent | 9f5b39070c6c1ea062cea18ecd3e767e1dc3671a (diff) |
Merge branch 'akpm-current/current'
Conflicts:
arch/x86/syscalls/syscall_64.tbl
403 files changed, 11905 insertions, 5563 deletions
@@ -1381,6 +1381,9 @@ S: 17 rue Danton S: F - 94270 Le Kremlin-Bicêtre S: France +N: Jack Hammer +D: IBM ServeRAID RAID (ips) driver maintenance + N: Greg Hankins E: gregh@cc.gatech.edu D: fixed keyboard driver to separate LED and locking status @@ -1691,6 +1694,10 @@ S: Reading S: RG6 2NU S: United Kingdom +N: Dave Jeffery +E: dhjeffery@gmail.com +D: SCSI hacks and IBM ServeRAID RAID driver maintenance + N: Jakub Jelinek E: jakub@redhat.com W: http://sunsite.mff.cuni.cz/~jj diff --git a/Documentation/ABI/testing/sysfs-fs-nilfs2 b/Documentation/ABI/testing/sysfs-fs-nilfs2 new file mode 100644 index 000000000000..304ba84a973a --- /dev/null +++ b/Documentation/ABI/testing/sysfs-fs-nilfs2 @@ -0,0 +1,269 @@ + +What: /sys/fs/nilfs2/features/revision +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show current revision of NILFS file system driver. + This value informs about file system revision that + driver is ready to support. + +What: /sys/fs/nilfs2/features/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/features group. + +What: /sys/fs/nilfs2/<device>/revision +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show NILFS file system revision on volume. + This value informs about metadata structures' + revision on mounted volume. + +What: /sys/fs/nilfs2/<device>/blocksize +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show volume's block size in bytes. + +What: /sys/fs/nilfs2/<device>/device_size +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show volume size in bytes. + +What: /sys/fs/nilfs2/<device>/free_blocks +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show count of free blocks on volume. + +What: /sys/fs/nilfs2/<device>/uuid +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show volume's UUID (Universally Unique Identifier). + +What: /sys/fs/nilfs2/<device>/volume_name +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show volume's label. + +What: /sys/fs/nilfs2/<device>/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device> group. + +What: /sys/fs/nilfs2/<device>/superblock/sb_write_time +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show last write time of super block in human-readable + format. + +What: /sys/fs/nilfs2/<device>/superblock/sb_write_time_secs +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show last write time of super block in seconds. + +What: /sys/fs/nilfs2/<device>/superblock/sb_write_count +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show current write count of super block. + +What: /sys/fs/nilfs2/<device>/superblock/sb_update_frequency +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show/Set interval of periodical update of superblock + (in seconds). + +What: /sys/fs/nilfs2/<device>/superblock/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device>/superblock + group. + +What: /sys/fs/nilfs2/<device>/segctor/last_pseg_block +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show start block number of the latest segment. + +What: /sys/fs/nilfs2/<device>/segctor/last_seg_sequence +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show sequence value of the latest segment. + +What: /sys/fs/nilfs2/<device>/segctor/last_seg_checkpoint +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show checkpoint number of the latest segment. + +What: /sys/fs/nilfs2/<device>/segctor/current_seg_sequence +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show segment sequence counter. + +What: /sys/fs/nilfs2/<device>/segctor/current_last_full_seg +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show index number of the latest full segment. + +What: /sys/fs/nilfs2/<device>/segctor/next_full_seg +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show index number of the full segment index + to be used next. + +What: /sys/fs/nilfs2/<device>/segctor/next_pseg_offset +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show offset of next partial segment in the current + full segment. + +What: /sys/fs/nilfs2/<device>/segctor/next_checkpoint +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show next checkpoint number. + +What: /sys/fs/nilfs2/<device>/segctor/last_seg_write_time +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show write time of the last segment in + human-readable format. + +What: /sys/fs/nilfs2/<device>/segctor/last_seg_write_time_secs +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show write time of the last segment in seconds. + +What: /sys/fs/nilfs2/<device>/segctor/last_nongc_write_time +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show write time of the last segment not for cleaner + operation in human-readable format. + +What: /sys/fs/nilfs2/<device>/segctor/last_nongc_write_time_secs +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show write time of the last segment not for cleaner + operation in seconds. + +What: /sys/fs/nilfs2/<device>/segctor/dirty_data_blocks_count +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of dirty data blocks. + +What: /sys/fs/nilfs2/<device>/segctor/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device>/segctor + group. + +What: /sys/fs/nilfs2/<device>/segments/segments_number +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of segments on a volume. + +What: /sys/fs/nilfs2/<device>/segments/blocks_per_segment +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of blocks in segment. + +What: /sys/fs/nilfs2/<device>/segments/clean_segments +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show count of clean segments. + +What: /sys/fs/nilfs2/<device>/segments/dirty_segments +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show count of dirty segments. + +What: /sys/fs/nilfs2/<device>/segments/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device>/segments + group. + +What: /sys/fs/nilfs2/<device>/checkpoints/checkpoints_number +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of checkpoints on volume. + +What: /sys/fs/nilfs2/<device>/checkpoints/snapshots_number +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of snapshots on volume. + +What: /sys/fs/nilfs2/<device>/checkpoints/last_seg_checkpoint +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show checkpoint number of the latest segment. + +What: /sys/fs/nilfs2/<device>/checkpoints/next_checkpoint +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show next checkpoint number. + +What: /sys/fs/nilfs2/<device>/checkpoints/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device>/checkpoints + group. + +What: /sys/fs/nilfs2/<device>/mounted_snapshots/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe content of /sys/fs/nilfs2/<device>/mounted_snapshots + group. + +What: /sys/fs/nilfs2/<device>/mounted_snapshots/<id>/inodes_count +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of inodes for snapshot. + +What: /sys/fs/nilfs2/<device>/mounted_snapshots/<id>/blocks_count +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Show number of blocks for snapshot. + +What: /sys/fs/nilfs2/<device>/mounted_snapshots/<id>/README +Date: April 2014 +Contact: "Vyacheslav Dubeyko" <slava@dubeyko.com> +Description: + Describe attributes of /sys/fs/nilfs2/<device>/mounted_snapshots/<id> + group. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 49b8551a3b68..e48c57f1943b 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -818,7 +818,7 @@ RCU pointer/list update: list_add_tail_rcu list_del_rcu list_replace_rcu - hlist_add_after_rcu + hlist_add_behind_rcu hlist_add_before_rcu hlist_add_head_rcu hlist_del_rcu diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index 80ac454704b8..8870b0212150 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,64 +24,27 @@ Please note that implementation details can be changed. a page/swp_entry may be charged (usage += PAGE_SIZE) at - mem_cgroup_charge_anon() - Called at new page fault and Copy-On-Write. - - mem_cgroup_try_charge_swapin() - Called at do_swap_page() (page fault on swap entry) and swapoff. - Followed by charge-commit-cancel protocol. (With swap accounting) - At commit, a charge recorded in swap_cgroup is removed. - - mem_cgroup_charge_file() - Called at add_to_page_cache() - - mem_cgroup_cache_charge_swapin() - Called at shmem's swapin. - - mem_cgroup_prepare_migration() - Called before migration. "extra" charge is done and followed by - charge-commit-cancel protocol. - At commit, charge against oldpage or newpage will be committed. + mem_cgroup_try_charge() 2. Uncharge a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by - mem_cgroup_uncharge_page() - Called when an anonymous page is fully unmapped. I.e., mapcount goes - to 0. If the page is SwapCache, uncharge is delayed until - mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_cache_page() - Called when a page-cache is deleted from radix-tree. If the page is - SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache(). - - mem_cgroup_uncharge_swapcache() - Called when SwapCache is removed from radix-tree. The charge itself - is moved to swap_cgroup. (If mem+swap controller is disabled, no - charge to swap occurs.) + mem_cgroup_uncharge() + Called when a page's refcount goes down to 0. mem_cgroup_uncharge_swap() Called when swp_entry's refcnt goes down to 0. A charge against swap disappears. - mem_cgroup_end_migration(old, new) - At success of migration old is uncharged (if necessary), a charge - to new page is committed. At failure, charge to old page is committed. - 3. charge-commit-cancel - In some case, we can't know this "charge" is valid or not at charging - (because of races). - To handle such case, there are charge-commit-cancel functions. - mem_cgroup_try_charge_XXX - mem_cgroup_commit_charge_XXX - mem_cgroup_cancel_charge_XXX - these are used in swap-in and migration. + Memcg pages are charged in two steps: + mem_cgroup_try_charge() + mem_cgroup_commit_charge() or mem_cgroup_cancel_charge() At try_charge(), there are no flags to say "this page is charged". at this point, usage += PAGE_SIZE. - At commit(), the function checks the page should be charged or not - and set flags or avoid charging.(usage -= PAGE_SIZE) + At commit(), the page is associated with the memcg. At cancel(), simply usage -= PAGE_SIZE. @@ -91,18 +54,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. Anonymous page is newly allocated at - page fault into MAP_ANONYMOUS mapping. - Copy-On-Write. - It is charged right after it's allocated before doing any page table - related operations. Of course, it's uncharged when another page is used - for the fault address. - - At freeing anonymous page (by exit() or munmap()), zap_pte() is called - and pages for ptes are freed one by one.(see mm/memory.c). Uncharges - are done at page_remove_rmap() when page_mapcount() goes down to 0. - - Another page freeing is by page-reclaim (vmscan.c) and anonymous - pages are swapped out. In this case, the page is marked as - PageSwapCache(). uncharge() routine doesn't uncharge the page marked - as SwapCache(). It's delayed until __delete_from_swap_cache(). 4.1 Swap-in. At swap-in, the page is taken from swap-cache. There are 2 cases. @@ -111,41 +62,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. (b) If the SwapCache has been mapped by processes, it has been charged already. - This swap-in is one of the most complicated work. In do_swap_page(), - following events occur when pte is unchanged. - - (1) the page (SwapCache) is looked up. - (2) lock_page() - (3) try_charge_swapin() - (4) reuse_swap_page() (may call delete_swap_cache()) - (5) commit_charge_swapin() - (6) swap_free(). - - Considering following situation for example. - - (A) The page has not been charged before (2) and reuse_swap_page() - doesn't call delete_from_swap_cache(). - (B) The page has not been charged before (2) and reuse_swap_page() - calls delete_from_swap_cache(). - (C) The page has been charged before (2) and reuse_swap_page() doesn't - call delete_from_swap_cache(). - (D) The page has been charged before (2) and reuse_swap_page() calls - delete_from_swap_cache(). - - memory.usage/memsw.usage changes to this page/swp_entry will be - Case (A) (B) (C) (D) - Event - Before (2) 0/ 1 0/ 1 1/ 1 1/ 1 - =========================================== - (3) +1/+1 +1/+1 +1/+1 +1/+1 - (4) - 0/ 0 - -1/ 0 - (5) 0/-1 0/ 0 -1/-1 0/ 0 - (6) - 0/-1 - 0/-1 - =========================================== - Result 1/ 1 1/ 1 1/ 1 1/ 1 - - In any cases, charges to this page should be 1/ 1. - 4.2 Swap-out. At swap-out, typical state transition is below. @@ -158,28 +74,20 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. swp_entry's refcnt -= 1. - At (b), the page is marked as SwapCache and not uncharged. - At (d), the page is removed from SwapCache and a charge in page_cgroup - is moved to swap_cgroup. - Finally, at task exit, (e) zap_pte() is called and swp_entry's refcnt -=1 -> 0. - Here, a charge in swap_cgroup disappears. 5. Page Cache Page Cache is charged at - add_to_page_cache_locked(). - uncharged at - - __remove_from_page_cache(). - The logic is very clear. (About migration, see below) Note: __remove_from_page_cache() is called by remove_from_page_cache() and __remove_mapping(). 6. Shmem(tmpfs) Page Cache - Memcg's charge/uncharge have special handlers of shmem. The best way - to understand shmem's page state transition is to read mm/shmem.c. + The best way to understand shmem's page state transition is to read + mm/shmem.c. But brief explanation of the behavior of memcg around shmem will be helpful to understand the logic. @@ -192,56 +100,10 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y. It's charged when... - A new page is added to shmem's radix-tree. - A swp page is read. (move a charge from swap_cgroup to page_cgroup) - It's uncharged when - - A page is removed from radix-tree and not SwapCache. - - When SwapCache is removed, a charge is moved to swap_cgroup. - - When swp_entry's refcnt goes down to 0, a charge in swap_cgroup - disappears. 7. Page Migration - One of the most complicated functions is page-migration-handler. - Memcg has 2 routines. Assume that we are migrating a page's contents - from OLDPAGE to NEWPAGE. - - Usual migration logic is.. - (a) remove the page from LRU. - (b) allocate NEWPAGE (migration target) - (c) lock by lock_page(). - (d) unmap all mappings. - (e-1) If necessary, replace entry in radix-tree. - (e-2) move contents of a page. - (f) map all mappings again. - (g) pushback the page to LRU. - (-) OLDPAGE will be freed. - - Before (g), memcg should complete all necessary charge/uncharge to - NEWPAGE/OLDPAGE. - - The point is.... - - If OLDPAGE is anonymous, all charges will be dropped at (d) because - try_to_unmap() drops all mapcount and the page will not be - SwapCache. - - - If OLDPAGE is SwapCache, charges will be kept at (g) because - __delete_from_swap_cache() isn't called at (e-1) - - - If OLDPAGE is page-cache, charges will be kept at (g) because - remove_from_swap_cache() isn't called at (e-1) - - memcg provides following hooks. - - - mem_cgroup_prepare_migration(OLDPAGE) - Called after (b) to account a charge (usage += PAGE_SIZE) against - memcg which OLDPAGE belongs to. - - - mem_cgroup_end_migration(OLDPAGE, NEWPAGE) - Called after (f) before (g). - If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already - charged, a charge by prepare_migration() is automatically canceled. - If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE. - - But zap_pte() (by exit or munmap) can be called while migration, - we have to check if OLDPAGE/NEWPAGE is a valid page after commit(). + + mem_cgroup_migrate() 8. LRU Each memcg has its own private LRU. Now, its handling is under global diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 427b1dd12709..7d02a8b077fd 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1716,8 +1716,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. 7 (KERN_DEBUG) debug-level messages log_buf_len=n[KMG] Sets the size of the printk ring buffer, - in bytes. n must be a power of two. The default - size is set in the kernel config file. + in bytes. n must be a power of two and greater + than the minimal size. The minimal size is defined + by LOG_BUF_SHIFT kernel config parameter. There is + also CONFIG_LOG_CPU_MAX_BUF_SHIFT config parameter + that allows to increase the default size depending on + the number of CPUs. See init/Kconfig for more details. logo.nologo [FB] Disables display of the built-in Linux logo. This may be used to provide more screen space for diff --git a/Documentation/leds/leds-class.txt b/Documentation/leds/leds-class.txt index 79699c200766..62261c04060a 100644 --- a/Documentation/leds/leds-class.txt +++ b/Documentation/leds/leds-class.txt @@ -2,9 +2,6 @@ LED handling under Linux ======================== -If you're reading this and thinking about keyboard leds, these are -handled by the input subsystem and the led class is *not* needed. - In its simplest form, the LED class just allows control of LEDs from userspace. LEDs appear in /sys/class/leds/. The maximum brightness of the LED is defined in max_brightness file. The brightness file will set the brightness diff --git a/Documentation/oops-tracing.txt b/Documentation/oops-tracing.txt index e3155995ddd8..beefb9f82902 100644 --- a/Documentation/oops-tracing.txt +++ b/Documentation/oops-tracing.txt @@ -268,6 +268,8 @@ characters, each representing a particular tainted value. 14: 'E' if an unsigned module has been loaded in a kernel supporting module signature. + 15: 'L' if a soft lockup has previously occurred on the system. + The primary reason for the 'Tainted: ' string is to tell kernel debuggers if this is a clean kernel or if anything unusual has occurred. Tainting is permanent: even if an offending module is diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index b4498218c474..3b56a991f52e 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -184,6 +184,12 @@ dentry names: equivalent of %s dentry->d_name.name we used to use, %pd<n> prints n last components. %pD does the same thing for struct file. +task_struct comm name: + + %pT + + For printing task_struct->comm. + struct va_format: %pV diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index c14374e71775..f79eb9666379 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -826,6 +826,7 @@ can be ORed together: 4096 - An out-of-tree module has been loaded. 8192 - An unsigned module has been loaded in a kernel supporting module signature. +16384 - A soft lockup has previously occurred on the system. ============================================================== diff --git a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl index 78c9a7b2b58f..8f961ef2b457 100644 --- a/Documentation/trace/postprocess/trace-vmscan-postprocess.pl +++ b/Documentation/trace/postprocess/trace-vmscan-postprocess.pl @@ -47,6 +47,10 @@ use constant HIGH_KSWAPD_REWAKEUP => 21; use constant HIGH_NR_SCANNED => 22; use constant HIGH_NR_TAKEN => 23; use constant HIGH_NR_RECLAIMED => 24; +use constant HIGH_NR_FILE_SCANNED => 25; +use constant HIGH_NR_ANON_SCANNED => 26; +use constant HIGH_NR_FILE_RECLAIMED => 27; +use constant HIGH_NR_ANON_RECLAIMED => 28; my %perprocesspid; my %perprocess; @@ -56,14 +60,18 @@ my $opt_read_procstat; my $total_wakeup_kswapd; my ($total_direct_reclaim, $total_direct_nr_scanned); +my ($total_direct_nr_file_scanned, $total_direct_nr_anon_scanned); my ($total_direct_latency, $total_kswapd_latency); my ($total_direct_nr_reclaimed); +my ($total_direct_nr_file_reclaimed, $total_direct_nr_anon_reclaimed); my ($total_direct_writepage_file_sync, $total_direct_writepage_file_async); my ($total_direct_writepage_anon_sync, $total_direct_writepage_anon_async); my ($total_kswapd_nr_scanned, $total_kswapd_wake); +my ($total_kswapd_nr_file_scanned, $total_kswapd_nr_anon_scanned); my ($total_kswapd_writepage_file_sync, $total_kswapd_writepage_file_async); my ($total_kswapd_writepage_anon_sync, $total_kswapd_writepage_anon_async); my ($total_kswapd_nr_reclaimed); +my ($total_kswapd_nr_file_reclaimed, $total_kswapd_nr_anon_reclaimed); # Catch sigint and exit on request my $sigint_report = 0; @@ -374,6 +382,7 @@ EVENT_PROCESS: } my $isolate_mode = $1; my $nr_scanned = $4; + my $file = $6; # To closer match vmstat scanning statistics, only count isolate_both # and isolate_inactive as scanning. isolate_active is rotation @@ -382,6 +391,11 @@ EVENT_PROCESS: # isolate_both == 3 if ($isolate_mode != 2) { $perprocesspid{$process_pid}->{HIGH_NR_SCANNED} += $nr_scanned; + if ($file == 1) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED} += $nr_scanned; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED} += $nr_scanned; + } } } elsif ($tracepoint eq "mm_vmscan_lru_shrink_inactive") { $details = $6; @@ -391,8 +405,19 @@ EVENT_PROCESS: print " $regex_lru_shrink_inactive/o\n"; next; } + my $nr_reclaimed = $4; + my $flags = $6; + my $file = 0; + if ($flags =~ /RECLAIM_WB_FILE/) { + $file = 1; + } $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED} += $nr_reclaimed; + if ($file) { + $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED} += $nr_reclaimed; + } else { + $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED} += $nr_reclaimed; + } } elsif ($tracepoint eq "mm_vmscan_writepage") { $details = $6; if ($details !~ /$regex_writepage/o) { @@ -493,7 +518,11 @@ sub dump_stats { $total_direct_reclaim += $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}; $total_wakeup_kswapd += $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $total_direct_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_direct_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_direct_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_direct_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_direct_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_direct_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_direct_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_direct_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_direct_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -513,7 +542,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_DIRECT_RECLAIM_BEGIN}, $stats{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}, $this_reclaim_delay / 1000); @@ -552,7 +585,11 @@ sub dump_stats { $total_kswapd_wake += $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}; $total_kswapd_nr_scanned += $stats{$process_pid}->{HIGH_NR_SCANNED}; + $total_kswapd_nr_file_scanned += $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $total_kswapd_nr_anon_scanned += $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}; $total_kswapd_nr_reclaimed += $stats{$process_pid}->{HIGH_NR_RECLAIMED}; + $total_kswapd_nr_file_reclaimed += $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $total_kswapd_nr_anon_reclaimed += $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $total_kswapd_writepage_file_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $total_kswapd_writepage_anon_sync += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $total_kswapd_writepage_file_async += $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; @@ -563,7 +600,11 @@ sub dump_stats { $stats{$process_pid}->{MM_VMSCAN_KSWAPD_WAKE}, $stats{$process_pid}->{HIGH_KSWAPD_REWAKEUP}, $stats{$process_pid}->{HIGH_NR_SCANNED}, + $stats{$process_pid}->{HIGH_NR_FILE_SCANNED}, + $stats{$process_pid}->{HIGH_NR_ANON_SCANNED}, $stats{$process_pid}->{HIGH_NR_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_FILE_RECLAIMED}, + $stats{$process_pid}->{HIGH_NR_ANON_RECLAIMED}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}, $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} + $stats{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_ASYNC}); @@ -594,7 +635,11 @@ sub dump_stats { print "\nSummary\n"; print "Direct reclaims: $total_direct_reclaim\n"; print "Direct reclaim pages scanned: $total_direct_nr_scanned\n"; + print "Direct reclaim file pages scanned: $total_direct_nr_file_scanned\n"; + print "Direct reclaim anon pages scanned: $total_direct_nr_anon_scanned\n"; print "Direct reclaim pages reclaimed: $total_direct_nr_reclaimed\n"; + print "Direct reclaim file pages reclaimed: $total_direct_nr_file_reclaimed\n"; + print "Direct reclaim anon pages reclaimed: $total_direct_nr_anon_reclaimed\n"; print "Direct reclaim write file sync I/O: $total_direct_writepage_file_sync\n"; print "Direct reclaim write anon sync I/O: $total_direct_writepage_anon_sync\n"; print "Direct reclaim write file async I/O: $total_direct_writepage_file_async\n"; @@ -604,7 +649,11 @@ sub dump_stats { print "\n"; print "Kswapd wakeups: $total_kswapd_wake\n"; print "Kswapd pages scanned: $total_kswapd_nr_scanned\n"; + print "Kswapd file pages scanned: $total_kswapd_nr_file_scanned\n"; + print "Kswapd anon pages scanned: $total_kswapd_nr_anon_scanned\n"; print "Kswapd pages reclaimed: $total_kswapd_nr_reclaimed\n"; + print "Kswapd file pages reclaimed: $total_kswapd_nr_file_reclaimed\n"; + print "Kswapd anon pages reclaimed: $total_kswapd_nr_anon_reclaimed\n"; print "Kswapd reclaim write file sync I/O: $total_kswapd_writepage_file_sync\n"; print "Kswapd reclaim write anon sync I/O: $total_kswapd_writepage_anon_sync\n"; print "Kswapd reclaim write file async I/O: $total_kswapd_writepage_file_async\n"; @@ -629,7 +678,11 @@ sub aggregate_perprocesspid() { $perprocess{$process}->{MM_VMSCAN_WAKEUP_KSWAPD} += $perprocesspid{$process_pid}->{MM_VMSCAN_WAKEUP_KSWAPD}; $perprocess{$process}->{HIGH_KSWAPD_REWAKEUP} += $perprocesspid{$process_pid}->{HIGH_KSWAPD_REWAKEUP}; $perprocess{$process}->{HIGH_NR_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_SCANNED}; + $perprocess{$process}->{HIGH_NR_FILE_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_SCANNED}; + $perprocess{$process}->{HIGH_NR_ANON_SCANNED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_SCANNED}; $perprocess{$process}->{HIGH_NR_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_FILE_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_FILE_RECLAIMED}; + $perprocess{$process}->{HIGH_NR_ANON_RECLAIMED} += $perprocesspid{$process_pid}->{HIGH_NR_ANON_RECLAIMED}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_ANON_SYNC}; $perprocess{$process}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC} += $perprocesspid{$process_pid}->{MM_VMSCAN_WRITEPAGE_FILE_ASYNC}; diff --git a/MAINTAINERS b/MAINTAINERS index fd97e0c8a47c..78e1cb2f5dc3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3221,26 +3221,12 @@ T: git git://linuxtv.org/anttip/media_tree.git S: Maintained F: drivers/media/tuners/e4000* -EATA-DMA SCSI DRIVER -M: Michael Neuffer <mike@i-Connect.Net> -L: linux-eata@i-connect.net -L: linux-scsi@vger.kernel.org -S: Maintained -F: drivers/scsi/eata* - EATA ISA/EISA/PCI SCSI DRIVER M: Dario Ballabio <ballabio_dario@emc.com> L: linux-scsi@vger.kernel.org S: Maintained F: drivers/scsi/eata.c -EATA-PIO SCSI DRIVER -M: Michael Neuffer <mike@i-Connect.Net> -L: linux-eata@i-connect.net -L: linux-scsi@vger.kernel.org -S: Maintained -F: drivers/scsi/eata_pio.* - EC100 MEDIA DRIVER M: Antti Palosaari <crope@iki.fi> L: linux-media@vger.kernel.org @@ -4485,10 +4471,7 @@ S: Supported F: drivers/scsi/ibmvscsi/ibmvfc* IBM ServeRAID RAID DRIVER -P: Jack Hammer -M: Dave Jeffery <ipslinux@adaptec.com> -W: http://www.developer.ibm.com/welcome/netfinity/serveraid.html -S: Supported +S: Orphan F: drivers/scsi/ips.* ICH LPC AND GPIO DRIVER @@ -7174,7 +7157,7 @@ F: drivers/ptp/* F: include/linux/ptp_cl* PTRACE SUPPORT -M: Roland McGrath <roland@redhat.com> +M: Roland McGrath <roland@hack.frob.com> M: Oleg Nesterov <oleg@redhat.com> S: Maintained F: include/asm-generic/syscall.h @@ -637,6 +637,9 @@ else KBUILD_CFLAGS += -O2 endif +# Tell gcc to never replace conditional load with a non-conditional one +KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0) + ifdef CONFIG_READABLE_ASM # Disable optimizations that make assembler listings hard to read. # reorder blocks reorders the control in the function @@ -652,6 +655,22 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) endif # Handle stack protector mode. +# +# Since kbuild can potentially perform two passes (first with the old +# .config values and then with updated .config values), we cannot error out +# if a desired compiler option is unsupported. If we were to error, kbuild +# could never get to the second pass and actually notice that we changed +# the option to something that was supported. +# +# Additionally, we don't want to fallback and/or silently change which compiler +# flags will be used, since that leads to producing kernels with different +# security feature characteristics depending on the compiler used. ("But I +# selected CC_STACKPROTECTOR_STRONG! Why did it build with _REGULAR?!") +# +# The middle ground is to warn here so that the failed option is obvious, but +# to let the build fail with bad compiler flags so that we can't produce a +# kernel when there is a CONFIG and compiler mismatch. +# ifdef CONFIG_CC_STACKPROTECTOR_REGULAR stackp-flag := -fstack-protector ifeq ($(call cc-option, $(stackp-flag)),) diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 96e54bed5088..e858aa0ad8af 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -6,4 +6,5 @@ generic-y += exec.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/alpha/include/asm/scatterlist.h b/arch/alpha/include/asm/scatterlist.h deleted file mode 100644 index 017d7471c3c4..000000000000 --- a/arch/alpha/include/asm/scatterlist.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ALPHA_SCATTERLIST_H -#define _ALPHA_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* !(_ALPHA_SCATTERLIST_H) */ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 1d610c823c5e..77632c416bd9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -84,6 +84,7 @@ config ARM <http://www.arm.linux.org.uk/>. config ARM_HAS_SG_CHAIN + select ARCH_HAS_SG_CHAIN bool config NEED_SG_DMA_LENGTH @@ -1968,6 +1969,8 @@ config XIP_PHYS_ADDR config KEXEC bool "Kexec system call (EXPERIMENTAL)" depends on (!SMP || PM_SLEEP_SMP) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild index f5a357601983..70cd84eb7fda 100644 --- a/arch/arm/include/asm/Kbuild +++ b/arch/arm/include/asm/Kbuild @@ -22,6 +22,7 @@ generic-y += poll.h generic-y += preempt.h generic-y += resource.h generic-y += rwsem.h +generic-y += scatterlist.h generic-y += sections.h generic-y += segment.h generic-y += sembuf.h diff --git a/arch/arm/include/asm/scatterlist.h b/arch/arm/include/asm/scatterlist.h deleted file mode 100644 index cefdb8f898a1..000000000000 --- a/arch/arm/include/asm/scatterlist.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef _ASMARM_SCATTERLIST_H -#define _ASMARM_SCATTERLIST_H - -#ifdef CONFIG_ARM_HAS_SG_CHAIN -#define ARCH_HAS_SG_CHAIN -#endif - -#include <asm/memory.h> -#include <asm/types.h> -#include <asm-generic/scatterlist.h> - -#endif /* _ASMARM_SCATTERLIST_H */ diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 1f88db06b133..7a996aaa061e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -26,6 +26,7 @@ #include <linux/io.h> #include <linux/vmalloc.h> #include <linux/sizes.h> +#include <linux/cma.h> #include <asm/memory.h> #include <asm/highmem.h> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 736a42a1242b..4bcd435b6fd4 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2,6 +2,7 @@ config ARM64 def_bool y select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select ARCH_HAS_OPP + select ARCH_HAS_SG_CHAIN select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_USE_CMPXCHG_LOCKREF select ARCH_SUPPORTS_ATOMIC_RMW diff --git a/arch/cris/include/asm/Kbuild b/arch/cris/include/asm/Kbuild index afff5105909d..31742dfadff9 100644 --- a/arch/cris/include/asm/Kbuild +++ b/arch/cris/include/asm/Kbuild @@ -13,6 +13,7 @@ generic-y += linkage.h generic-y += mcs_spinlock.h generic-y += module.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += vga.h generic-y += xor.h diff --git a/arch/cris/include/asm/scatterlist.h b/arch/cris/include/asm/scatterlist.h deleted file mode 100644 index f11f8f40ec4a..000000000000 --- a/arch/cris/include/asm/scatterlist.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef __ASM_CRIS_SCATTERLIST_H -#define __ASM_CRIS_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* !(__ASM_CRIS_SCATTERLIST_H) */ diff --git a/arch/frv/include/asm/Kbuild b/arch/frv/include/asm/Kbuild index 87b95eb8aee5..5b73921b6e9d 100644 --- a/arch/frv/include/asm/Kbuild +++ b/arch/frv/include/asm/Kbuild @@ -5,4 +5,5 @@ generic-y += exec.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/frv/include/asm/scatterlist.h b/arch/frv/include/asm/scatterlist.h deleted file mode 100644 index 0e5eb3018468..000000000000 --- a/arch/frv/include/asm/scatterlist.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCATTERLIST_H -#define _ASM_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* !_ASM_SCATTERLIST_H */ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 2f3abcf8f6bc..8db0b5f02034 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -27,6 +27,7 @@ config IA64 select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SG_CHAIN select VIRT_TO_BUS select ARCH_DISCARD_MEMBLOCK select GENERIC_IRQ_PROBE @@ -547,6 +548,8 @@ source "drivers/sn/Kconfig" config KEXEC bool "kexec system call" depends on !IA64_HP_SIM && (!SMP || HOTPLUG_CPU) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index 0da4aa2602ae..e8317d2d6c8d 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -5,5 +5,6 @@ generic-y += hash.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/ia64/include/asm/scatterlist.h b/arch/ia64/include/asm/scatterlist.h deleted file mode 100644 index 08fd93bff1db..000000000000 --- a/arch/ia64/include/asm/scatterlist.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_IA64_SCATTERLIST_H -#define _ASM_IA64_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_IA64_SCATTERLIST_H */ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 71c52bc7c28d..a149c6731d65 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -384,21 +384,6 @@ static struct irqaction timer_irqaction = { .name = "timer" }; -static struct platform_device rtc_efi_dev = { - .name = "rtc-efi", - .id = -1, -}; - -static int __init rtc_init(void) -{ - if (platform_device_register(&rtc_efi_dev) < 0) - printk(KERN_ERR "unable to register rtc device...\n"); - - /* not necessarily an error */ - return 0; -} -module_init(rtc_init); - void read_persistent_clock(struct timespec *ts) { efi_gettimeofday(ts); diff --git a/arch/m32r/include/asm/Kbuild b/arch/m32r/include/asm/Kbuild index 67779a74b62d..accc10a3dc78 100644 --- a/arch/m32r/include/asm/Kbuild +++ b/arch/m32r/include/asm/Kbuild @@ -6,4 +6,5 @@ generic-y += hash.h generic-y += mcs_spinlock.h generic-y += module.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/m32r/include/asm/scatterlist.h b/arch/m32r/include/asm/scatterlist.h deleted file mode 100644 index 7370b8b6243e..000000000000 --- a/arch/m32r/include/asm/scatterlist.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_M32R_SCATTERLIST_H -#define _ASM_M32R_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* _ASM_M32R_SCATTERLIST_H */ diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 87b7c7581b1d..3ff8c9a25335 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -91,6 +91,8 @@ config MMU_SUN3 config KEXEC bool "kexec system call" depends on M68KCLASSIC + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 3a480b3df0d6..d2263a0bdc3d 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -376,7 +376,6 @@ cache_flush_060 (unsigned long addr, int scope, int cache, unsigned long len) asmlinkage int sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) { - struct vm_area_struct *vma; int ret = -EINVAL; if (scope < FLUSH_SCOPE_LINE || scope > FLUSH_SCOPE_ALL || @@ -389,16 +388,23 @@ sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) if (!capable(CAP_SYS_ADMIN)) goto out; } else { + struct vm_area_struct *vma; + bool invalid; + + /* Check for overflow. */ + if (addr + len < addr) + goto out; + /* * Verify that the specified address region actually belongs * to this process. */ - vma = find_vma (current->mm, addr); ret = -EINVAL; - /* Check for overflow. */ - if (addr + len < addr) - goto out; - if (vma == NULL || addr < vma->vm_start || addr + len > vma->vm_end) + down_read(¤t->mm->mmap_sem); + vma = find_vma(current->mm, addr); + invalid = !vma || addr < vma->vm_start || addr + len > vma->vm_end; + up_read(¤t->mm->mmap_sem); + if (invalid) goto out; } diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index 35b3ecaf25d5..27a3acda6c19 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -7,5 +7,6 @@ generic-y += exec.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += syscalls.h generic-y += trace_clock.h diff --git a/arch/microblaze/include/asm/scatterlist.h b/arch/microblaze/include/asm/scatterlist.h deleted file mode 100644 index 35d786fe93ae..000000000000 --- a/arch/microblaze/include/asm/scatterlist.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/scatterlist.h> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4e238e6e661c..1f7bd3269efa 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2392,6 +2392,8 @@ source "kernel/Kconfig.preempt" config KEXEC bool "Kexec system call" + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/mn10300/include/asm/Kbuild b/arch/mn10300/include/asm/Kbuild index 654d5ba6e310..ecbd6676bd33 100644 --- a/arch/mn10300/include/asm/Kbuild +++ b/arch/mn10300/include/asm/Kbuild @@ -6,4 +6,5 @@ generic-y += exec.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/mn10300/include/asm/scatterlist.h b/arch/mn10300/include/asm/scatterlist.h deleted file mode 100644 index 7baa4006008a..000000000000 --- a/arch/mn10300/include/asm/scatterlist.h +++ /dev/null @@ -1,16 +0,0 @@ -/* MN10300 Scatterlist definitions - * - * Copyright (C) 2007 Matsushita Electric Industrial Co., Ltd. - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ -#ifndef _ASM_SCATTERLIST_H -#define _ASM_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* _ASM_SCATTERLIST_H */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 80b94b0add1f..a577609f8ed6 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -111,6 +111,7 @@ config PPC select HAVE_DMA_API_DEBUG select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK + select ARCH_HAS_SG_CHAIN select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS @@ -398,6 +399,8 @@ config PPC64_SUPPORTS_MEMORY_FAILURE config KEXEC bool "kexec system call" depends on (PPC_BOOK3S || FSL_BOOKE || (44x && !SMP)) + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 3fb1bc432f4f..7f23f162ce9c 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -4,5 +4,6 @@ generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h generic-y += rwsem.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += vtime.h diff --git a/arch/powerpc/include/asm/scatterlist.h b/arch/powerpc/include/asm/scatterlist.h deleted file mode 100644 index de1f620bd5c9..000000000000 --- a/arch/powerpc/include/asm/scatterlist.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _ASM_POWERPC_SCATTERLIST_H -#define _ASM_POWERPC_SCATTERLIST_H -/* - * Copyright (C) 2001 PPC64 Team, IBM Corp - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <asm/dma.h> -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_POWERPC_SCATTERLIST_H */ diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index ce569b6bf4d8..72905c30082e 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -90,7 +90,6 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ book3s_hv_rm_mmu.o \ book3s_hv_ras.o \ book3s_hv_builtin.o \ - book3s_hv_cma.o \ $(kvm-book3s_64-builtin-xics-objs-y) endif diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 2d154d9319b3..93218ae41f23 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -37,8 +37,6 @@ #include <asm/ppc-opcode.h> #include <asm/cputable.h> -#include "book3s_hv_cma.h" - /* POWER7 has 10-bit LPIDs, PPC970 has 6-bit LPIDs */ #define MAX_LPID_970 63 @@ -64,10 +62,10 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } kvm->arch.hpt_cma_alloc = 0; - VM_BUG_ON(order < KVM_CMA_CHUNK_ORDER); page = kvm_alloc_hpt(1 << (order - PAGE_SHIFT)); if (page) { hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1 << order)); kvm->arch.hpt_cma_alloc = 1; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 3b41447482e5..329d7fdd0a6a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -16,12 +16,14 @@ #include <linux/init.h> #include <linux/memblock.h> #include <linux/sizes.h> +#include <linux/cma.h> #include <asm/cputable.h> #include <asm/kvm_ppc.h> #include <asm/kvm_book3s.h> -#include "book3s_hv_cma.h" +#define KVM_CMA_CHUNK_ORDER 18 + /* * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) * should be power of 2. @@ -43,6 +45,8 @@ static unsigned long kvm_cma_resv_ratio = 5; unsigned long kvm_rma_pages = (1 << 27) >> PAGE_SHIFT; /* 128MB */ EXPORT_SYMBOL_GPL(kvm_rma_pages); +static struct cma *kvm_cma; + /* Work out RMLS (real mode limit selector) field value for a given RMA size. Assumes POWER7 or PPC970. */ static inline int lpcr_rmls(unsigned long rma_size) @@ -97,7 +101,7 @@ struct kvm_rma_info *kvm_alloc_rma() ri = kmalloc(sizeof(struct kvm_rma_info), GFP_KERNEL); if (!ri) return NULL; - page = kvm_alloc_cma(kvm_rma_pages, kvm_rma_pages); + page = cma_alloc(kvm_cma, kvm_rma_pages, get_order(kvm_rma_pages)); if (!page) goto err_out; atomic_set(&ri->use_count, 1); @@ -112,7 +116,7 @@ EXPORT_SYMBOL_GPL(kvm_alloc_rma); void kvm_release_rma(struct kvm_rma_info *ri) { if (atomic_dec_and_test(&ri->use_count)) { - kvm_release_cma(pfn_to_page(ri->base_pfn), kvm_rma_pages); + cma_release(kvm_cma, pfn_to_page(ri->base_pfn), kvm_rma_pages); kfree(ri); } } @@ -131,16 +135,18 @@ struct page *kvm_alloc_hpt(unsigned long nr_pages) { unsigned long align_pages = HPT_ALIGN_PAGES; + VM_BUG_ON(get_order(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + /* Old CPUs require HPT aligned on a multiple of its size */ if (!cpu_has_feature(CPU_FTR_ARCH_206)) align_pages = nr_pages; - return kvm_alloc_cma(nr_pages, align_pages); + return cma_alloc(kvm_cma, nr_pages, get_order(align_pages)); } EXPORT_SYMBOL_GPL(kvm_alloc_hpt); void kvm_release_hpt(struct page *page, unsigned long nr_pages) { - kvm_release_cma(page, nr_pages); + cma_release(kvm_cma, page, nr_pages); } EXPORT_SYMBOL_GPL(kvm_release_hpt); @@ -179,7 +185,8 @@ void __init kvm_cma_reserve(void) align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; align_size = max(kvm_rma_pages << PAGE_SHIFT, align_size); - kvm_cma_declare_contiguous(selected_size, align_size); + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); } } diff --git a/arch/powerpc/kvm/book3s_hv_cma.c b/arch/powerpc/kvm/book3s_hv_cma.c deleted file mode 100644 index d9d3d8553d51..000000000000 --- a/arch/powerpc/kvm/book3s_hv_cma.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ -#define pr_fmt(fmt) "kvm_cma: " fmt - -#ifdef CONFIG_CMA_DEBUG -#ifndef DEBUG -# define DEBUG -#endif -#endif - -#include <linux/memblock.h> -#include <linux/mutex.h> -#include <linux/sizes.h> -#include <linux/slab.h> - -#include "book3s_hv_cma.h" - -struct kvm_cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; -}; - -static DEFINE_MUTEX(kvm_cma_mutex); -static struct kvm_cma kvm_cma_area; - -/** - * kvm_cma_declare_contiguous() - reserve area for contiguous memory handling - * for kvm hash pagetable - * @size: Size of the reserved memory. - * @alignment: Alignment for the contiguous memory area - * - * This function reserves memory for kvm cma area. It should be - * called by arch code when early allocator (memblock or bootmem) - * is still activate. - */ -long __init kvm_cma_declare_contiguous(phys_addr_t size, phys_addr_t alignment) -{ - long base_pfn; - phys_addr_t addr; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s(size %lx)\n", __func__, (unsigned long)size); - - if (!size) - return -EINVAL; - /* - * Sanitise input arguments. - * We should be pageblock aligned for CMA. - */ - alignment = max(alignment, (phys_addr_t)(PAGE_SIZE << pageblock_order)); - size = ALIGN(size, alignment); - /* - * Reserve memory - * Use __memblock_alloc_base() since - * memblock_alloc_base() panic()s. - */ - addr = __memblock_alloc_base(size, alignment, 0); - if (!addr) { - base_pfn = -ENOMEM; - goto err; - } else - base_pfn = PFN_DOWN(addr); - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = base_pfn; - cma->count = size >> PAGE_SHIFT; - pr_info("CMA: reserved %ld MiB\n", (unsigned long)size / SZ_1M); - return 0; -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return base_pfn; -} - -/** - * kvm_alloc_cma() - allocate pages from contiguous area - * @nr_pages: Requested number of pages. - * @align_pages: Requested alignment in number of pages - * - * This function allocates memory buffer for hash pagetable. - */ -struct page *kvm_alloc_cma(unsigned long nr_pages, unsigned long align_pages) -{ - int ret; - struct page *page = NULL; - struct kvm_cma *cma = &kvm_cma_area; - unsigned long chunk_count, nr_chunk; - unsigned long mask, pfn, pageno, start = 0; - - - if (!cma || !cma->count) - return NULL; - - pr_debug("%s(cma %p, count %lu, align pages %lu)\n", __func__, - (void *)cma, nr_pages, align_pages); - - if (!nr_pages) - return NULL; - /* - * align mask with chunk size. The bit tracks pages in chunk size - */ - VM_BUG_ON(!is_power_of_2(align_pages)); - mask = (align_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)) - 1; - BUILD_BUG_ON(PAGE_SHIFT > KVM_CMA_CHUNK_ORDER); - - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - for (;;) { - pageno = bitmap_find_next_zero_area(cma->bitmap, chunk_count, - start, nr_chunk, mask); - if (pageno >= chunk_count) - break; - - pfn = cma->base_pfn + (pageno << (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT)); - ret = alloc_contig_range(pfn, pfn + nr_pages, MIGRATE_CMA); - if (ret == 0) { - bitmap_set(cma->bitmap, pageno, nr_chunk); - page = pfn_to_page(pfn); - memset(pfn_to_kaddr(pfn), 0, nr_pages << PAGE_SHIFT); - break; - } else if (ret != -EBUSY) { - break; - } - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = pageno + mask + 1; - } - mutex_unlock(&kvm_cma_mutex); - pr_debug("%s(): returned %p\n", __func__, page); - return page; -} - -/** - * kvm_release_cma() - release allocated pages for hash pagetable - * @pages: Allocated pages. - * @nr_pages: Number of allocated pages. - * - * This function releases memory allocated by kvm_alloc_cma(). - * It returns false when provided pages do not belong to contiguous area and - * true otherwise. - */ -bool kvm_release_cma(struct page *pages, unsigned long nr_pages) -{ - unsigned long pfn; - unsigned long nr_chunk; - struct kvm_cma *cma = &kvm_cma_area; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p count %lu)\n", __func__, (void *)pages, nr_pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + nr_pages > cma->base_pfn + cma->count); - nr_chunk = nr_pages >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - - mutex_lock(&kvm_cma_mutex); - bitmap_clear(cma->bitmap, - (pfn - cma->base_pfn) >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT), - nr_chunk); - free_contig_range(pfn, nr_pages); - mutex_unlock(&kvm_cma_mutex); - - return true; -} - -static int __init kvm_cma_activate_area(unsigned long base_pfn, - unsigned long count) -{ - unsigned long pfn = base_pfn; - unsigned i = count >> pageblock_order; - struct zone *zone; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - return -EINVAL; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - return 0; -} - -static int __init kvm_cma_init_reserved_areas(void) -{ - int bitmap_size, ret; - unsigned long chunk_count; - struct kvm_cma *cma = &kvm_cma_area; - - pr_debug("%s()\n", __func__); - if (!cma->count) - return 0; - chunk_count = cma->count >> (KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); - bitmap_size = BITS_TO_LONGS(chunk_count) * sizeof(long); - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!cma->bitmap) - return -ENOMEM; - - ret = kvm_cma_activate_area(cma->base_pfn, cma->count); - if (ret) - goto error; - return 0; - -error: - kfree(cma->bitmap); - return ret; -} -core_initcall(kvm_cma_init_reserved_areas); diff --git a/arch/powerpc/kvm/book3s_hv_cma.h b/arch/powerpc/kvm/book3s_hv_cma.h deleted file mode 100644 index 655144f75fa5..000000000000 --- a/arch/powerpc/kvm/book3s_hv_cma.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Contiguous Memory Allocator for ppc KVM hash pagetable based on CMA - * for DMA mapping framework - * - * Copyright IBM Corporation, 2013 - * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License or (at your optional) any later version of the license. - * - */ - -#ifndef __POWERPC_KVM_CMA_ALLOC_H__ -#define __POWERPC_KVM_CMA_ALLOC_H__ -/* - * Both RMA and Hash page allocation will be multiple of 256K. - */ -#define KVM_CMA_CHUNK_ORDER 18 - -extern struct page *kvm_alloc_cma(unsigned long nr_pages, - unsigned long align_pages); -extern bool kvm_release_cma(struct page *pages, unsigned long nr_pages); -extern long kvm_cma_declare_contiguous(phys_addr_t size, - phys_addr_t alignment) __init; -#endif diff --git a/arch/powerpc/mm/dma-noncoherent.c b/arch/powerpc/mm/dma-noncoherent.c index 7b6c10750179..d85e86aac7fb 100644 --- a/arch/powerpc/mm/dma-noncoherent.c +++ b/arch/powerpc/mm/dma-noncoherent.c @@ -33,6 +33,7 @@ #include <linux/export.h> #include <asm/tlbflush.h> +#include <asm/dma.h> #include "mmu_decl.h" diff --git a/arch/powerpc/platforms/44x/warp.c b/arch/powerpc/platforms/44x/warp.c index 534574a97ec9..3a104284b338 100644 --- a/arch/powerpc/platforms/44x/warp.c +++ b/arch/powerpc/platforms/44x/warp.c @@ -25,6 +25,7 @@ #include <asm/time.h> #include <asm/uic.h> #include <asm/ppc4xx.h> +#include <asm/dma.h> static __initdata struct of_device_id warp_of_bus[] = { diff --git a/arch/powerpc/platforms/52xx/efika.c b/arch/powerpc/platforms/52xx/efika.c index 6e19b0ad5d26..3feffde9128d 100644 --- a/arch/powerpc/platforms/52xx/efika.c +++ b/arch/powerpc/platforms/52xx/efika.c @@ -13,6 +13,7 @@ #include <generated/utsrelease.h> #include <linux/pci.h> #include <linux/of.h> +#include <asm/dma.h> #include <asm/prom.h> #include <asm/time.h> #include <asm/machdep.h> diff --git a/arch/powerpc/platforms/amigaone/setup.c b/arch/powerpc/platforms/amigaone/setup.c index 03aabc0e16ac..2fe12046279e 100644 --- a/arch/powerpc/platforms/amigaone/setup.c +++ b/arch/powerpc/platforms/amigaone/setup.c @@ -24,6 +24,7 @@ #include <asm/i8259.h> #include <asm/time.h> #include <asm/udbg.h> +#include <asm/dma.h> extern void __flush_disable_L1(void); diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index bb63499fc5d3..4ae4af600d7d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -48,6 +48,8 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC config KEXEC def_bool y + select CRYPTO + select CRYPTO_SHA256 config AUDIT_ARCH def_bool y @@ -146,6 +148,7 @@ config S390 select TTY select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS + select ARCH_HAS_SG_CHAIN config SCHED_OMIT_FRAME_POINTER def_bool y diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 57892a8a9055..b3fea0722ff1 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -4,4 +4,5 @@ generic-y += clkdev.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h diff --git a/arch/s390/include/asm/scatterlist.h b/arch/s390/include/asm/scatterlist.h deleted file mode 100644 index 6d45ef6c12a7..000000000000 --- a/arch/s390/include/asm/scatterlist.h +++ /dev/null @@ -1,3 +0,0 @@ -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN diff --git a/arch/score/include/asm/Kbuild b/arch/score/include/asm/Kbuild index 2f947aba4bd4..aad209199f7e 100644 --- a/arch/score/include/asm/Kbuild +++ b/arch/score/include/asm/Kbuild @@ -8,5 +8,6 @@ generic-y += cputime.h generic-y += hash.h generic-y += mcs_spinlock.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += trace_clock.h generic-y += xor.h diff --git a/arch/score/include/asm/scatterlist.h b/arch/score/include/asm/scatterlist.h deleted file mode 100644 index 9f533b8362c7..000000000000 --- a/arch/score/include/asm/scatterlist.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCORE_SCATTERLIST_H -#define _ASM_SCORE_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#endif /* _ASM_SCORE_SCATTERLIST_H */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 834b67c4db5a..09e12e1908a2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -596,6 +596,8 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call (EXPERIMENTAL)" depends on SUPERH32 && MMU + select CRYPTO + select CRYPTO_SHA256 help kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/sh/Makefile b/arch/sh/Makefile index d4d16e4be07c..bf5b3f5f4962 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -32,7 +32,8 @@ endif cflags-$(CONFIG_CPU_SH2) := $(call cc-option,-m2,) cflags-$(CONFIG_CPU_SH2A) += $(call cc-option,-m2a,) \ - $(call cc-option,-m2a-nofpu,) + $(call cc-option,-m2a-nofpu,) \ + $(call cc-option,-m4-nofpu,) cflags-$(CONFIG_CPU_SH3) := $(call cc-option,-m3,) cflags-$(CONFIG_CPU_SH4) := $(call cc-option,-m4,) \ $(call cc-option,-mno-implicit-fp,-m4-nofpu) diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig index cfd5b90a8628..78bc97b1d027 100644 --- a/arch/sh/drivers/dma/Kconfig +++ b/arch/sh/drivers/dma/Kconfig @@ -12,9 +12,8 @@ config SH_DMA_IRQ_MULTI default y if CPU_SUBTYPE_SH7750 || CPU_SUBTYPE_SH7751 || \ CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \ CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091 || \ - CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7764 || \ - CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ - CPU_SUBTYPE_SH7760 + CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ + CPU_SUBTYPE_SH7785 || CPU_SUBTYPE_SH7760 config SH_DMA_API depends on SH_DMA diff --git a/arch/sh/include/cpu-sh4/cpu/dma-register.h b/arch/sh/include/cpu-sh4/cpu/dma-register.h index 02788b6a03b7..9cd81e54056a 100644 --- a/arch/sh/include/cpu-sh4/cpu/dma-register.h +++ b/arch/sh/include/cpu-sh4/cpu/dma-register.h @@ -32,7 +32,6 @@ #define CHCR_TS_HIGH_SHIFT (20 - 2) /* 2 bits for shifted low TS */ #elif defined(CONFIG_CPU_SUBTYPE_SH7757) || \ defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - defined(CONFIG_CPU_SUBTYPE_SH7764) || \ defined(CONFIG_CPU_SUBTYPE_SH7780) || \ defined(CONFIG_CPU_SUBTYPE_SH7785) #define CHCR_TS_LOW_MASK 0x00000018 diff --git a/arch/sh/include/cpu-sh4a/cpu/dma.h b/arch/sh/include/cpu-sh4a/cpu/dma.h index 89afb650ce25..8ceccceae844 100644 --- a/arch/sh/include/cpu-sh4a/cpu/dma.h +++ b/arch/sh/include/cpu-sh4a/cpu/dma.h @@ -14,8 +14,7 @@ #define DMTE4_IRQ evt2irq(0xb80) #define DMAE0_IRQ evt2irq(0xbc0) /* DMA Error IRQ*/ #define SH_DMAC_BASE0 0xFE008020 -#elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ - defined(CONFIG_CPU_SUBTYPE_SH7764) +#elif defined(CONFIG_CPU_SUBTYPE_SH7763) #define DMTE0_IRQ evt2irq(0x640) #define DMTE4_IRQ evt2irq(0x780) #define DMAE0_IRQ evt2irq(0x6c0) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c index f579dd528198..c187b9579c21 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c @@ -307,7 +307,7 @@ static struct clk_lookup lookups[] = { CLKDEV_ICK_ID("fck", "sh-tmu.0", &mstp_clks[HWBLK_TMU0]), CLKDEV_ICK_ID("fck", "sh-tmu.1", &mstp_clks[HWBLK_TMU1]), - CLKDEV_ICK_ID("fck", "sh-cmt-16.0", &mstp_clks[HWBLK_CMT]), + CLKDEV_ICK_ID("fck", "sh-cmt-32.0", &mstp_clks[HWBLK_CMT]), CLKDEV_DEV_ID("sh-wdt.0", &mstp_clks[HWBLK_RWDT]), CLKDEV_DEV_ID("sh-dma-engine.1", &mstp_clks[HWBLK_DMAC1]), @@ -332,6 +332,8 @@ static struct clk_lookup lookups[] = { CLKDEV_CON_ID("tsif0", &mstp_clks[HWBLK_TSIF]), CLKDEV_DEV_ID("renesas_usbhs.1", &mstp_clks[HWBLK_USB1]), CLKDEV_DEV_ID("renesas_usbhs.0", &mstp_clks[HWBLK_USB0]), + CLKDEV_CON_ID("usb1", &mstp_clks[HWBLK_USB1]), + CLKDEV_CON_ID("usb0", &mstp_clks[HWBLK_USB0]), CLKDEV_CON_ID("2dg0", &mstp_clks[HWBLK_2DG]), CLKDEV_DEV_ID("sh_mobile_sdhi.0", &mstp_clks[HWBLK_SDHI0]), CLKDEV_DEV_ID("sh_mobile_sdhi.1", &mstp_clks[HWBLK_SDHI1]), diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 552c8fcf9416..d6d0a986c6e9 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -80,10 +80,8 @@ static int __init rtc_generic_init(void) return -ENODEV; pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - return 0; + return PTR_ERR_OR_ZERO(pdev); } module_init(rtc_generic_init); diff --git a/arch/sh/mm/asids-debugfs.c b/arch/sh/mm/asids-debugfs.c index 74c03ecc4871..ecfc6b0c1da1 100644 --- a/arch/sh/mm/asids-debugfs.c +++ b/arch/sh/mm/asids-debugfs.c @@ -67,10 +67,8 @@ static int __init asids_debugfs_init(void) NULL, &asids_debugfs_fops); if (!asids_dentry) return -ENOMEM; - if (IS_ERR(asids_dentry)) - return PTR_ERR(asids_dentry); - return 0; + return PTR_ERR_OR_ZERO(asids_dentry); } module_init(asids_debugfs_init); diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 407c87d9879a..bff3192219df 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -42,6 +42,7 @@ config SPARC select MODULES_USE_ELF_RELA select ODD_RT_SIGACTION select OLD_SIGSUSPEND + select ARCH_HAS_SG_CHAIN config SPARC32 def_bool !64BIT diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index a45821818003..cdd1b447bb6c 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,6 +15,7 @@ generic-y += mcs_spinlock.h generic-y += module.h generic-y += mutex.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += serial.h generic-y += trace_clock.h generic-y += types.h diff --git a/arch/sparc/include/asm/scatterlist.h b/arch/sparc/include/asm/scatterlist.h deleted file mode 100644 index 92bb638313f8..000000000000 --- a/arch/sparc/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _SPARC_SCATTERLIST_H -#define _SPARC_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN - -#endif /* !(_SPARC_SCATTERLIST_H) */ diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 4f3006b600e3..a37cdfb331e9 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -192,6 +192,8 @@ source "kernel/Kconfig.hz" config KEXEC bool "kexec system call" + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 4918d91bc3a6..d19b13e3a59f 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -58,7 +58,7 @@ void *module_alloc(unsigned long size) area->nr_pages = npages; area->pages = pages; - if (map_vm_area(area, prot_rwx, &pages)) { + if (map_vm_area(area, prot_rwx, pages)) { vunmap(area->addr); goto error; } diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index a5e4b6068213..7bd64aa2e94a 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -21,6 +21,7 @@ generic-y += param.h generic-y += pci.h generic-y += percpu.h generic-y += preempt.h +generic-y += scatterlist.h generic-y += sections.h generic-y += switch_to.h generic-y += topology.h diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index e5287d8517aa..61b6d51866f8 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -16,3 +16,7 @@ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ obj-y += net/ + +ifeq ($(CONFIG_X86_64),y) +obj-$(CONFIG_KEXEC) += purgatory/ +endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66be0176be29..d520d333a1b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -96,6 +96,7 @@ config X86 select IRQ_FORCED_THREADING select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select ARCH_HAS_SG_CHAIN select CLKEVT_I8253 select ARCH_HAVE_NMI_SAFE_CMPXCHG select GENERIC_IOMAP @@ -1579,6 +1580,9 @@ source kernel/Kconfig.hz config KEXEC bool "kexec system call" + select BUILD_BIN2C + select CRYPTO + select CRYPTO_SHA256 ---help--- kexec is a system call that implements the ability to shutdown your current kernel, and to start another kernel. It is like a reboot diff --git a/arch/x86/Makefile b/arch/x86/Makefile index c65fd9650467..c1aa36887843 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -183,6 +183,14 @@ archscripts: scripts_basic archheaders: $(Q)$(MAKE) $(build)=arch/x86/syscalls all +archprepare: +ifeq ($(CONFIG_KEXEC),y) +# Build only for 64bit. No loaders for 32bit yet. + ifeq ($(CONFIG_X86_64),y) + $(Q)$(MAKE) $(build)=arch/x86/purgatory arch/x86/purgatory/kexec-purgatory.c + endif +endif + ### # Kernel objects diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 3ca9762e1649..3bf000fab0ae 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -5,6 +5,7 @@ genhdr-y += unistd_64.h genhdr-y += unistd_x32.h generic-y += clkdev.h -generic-y += early_ioremap.h generic-y += cputime.h +generic-y += early_ioremap.h generic-y += mcs_spinlock.h +generic-y += scatterlist.h diff --git a/arch/x86/include/asm/crash.h b/arch/x86/include/asm/crash.h new file mode 100644 index 000000000000..f498411f2500 --- /dev/null +++ b/arch/x86/include/asm/crash.h @@ -0,0 +1,9 @@ +#ifndef _ASM_X86_CRASH_H +#define _ASM_X86_CRASH_H + +int crash_load_segments(struct kimage *image); +int crash_copy_backup_region(struct kimage *image); +int crash_setup_memmap_entries(struct kimage *image, + struct boot_params *params); + +#endif /* _ASM_X86_CRASH_H */ diff --git a/arch/x86/include/asm/kexec-bzimage64.h b/arch/x86/include/asm/kexec-bzimage64.h new file mode 100644 index 000000000000..d1b5d194e31d --- /dev/null +++ b/arch/x86/include/asm/kexec-bzimage64.h @@ -0,0 +1,6 @@ +#ifndef _ASM_KEXEC_BZIMAGE64_H +#define _ASM_KEXEC_BZIMAGE64_H + +extern struct kexec_file_ops kexec_bzImage64_ops; + +#endif /* _ASM_KEXE_BZIMAGE64_H */ diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h index 17483a492f18..d2434c1cad05 100644 --- a/arch/x86/include/asm/kexec.h +++ b/arch/x86/include/asm/kexec.h @@ -23,6 +23,9 @@ #include <asm/page.h> #include <asm/ptrace.h> +#include <asm/bootparam.h> + +struct kimage; /* * KEXEC_SOURCE_MEMORY_LIMIT maximum page get_free_page can return. @@ -61,6 +64,10 @@ # define KEXEC_ARCH KEXEC_ARCH_X86_64 #endif +/* Memory to backup during crash kdump */ +#define KEXEC_BACKUP_SRC_START (0UL) +#define KEXEC_BACKUP_SRC_END (640 * 1024UL) /* 640K */ + /* * CPU does not save ss and sp on stack if execution is already * running in kernel mode at the time of NMI occurrence. This code @@ -160,6 +167,44 @@ struct kimage_arch { pud_t *pud; pmd_t *pmd; pte_t *pte; + /* Details of backup region */ + unsigned long backup_src_start; + unsigned long backup_src_sz; + + /* Physical address of backup segment */ + unsigned long backup_load_addr; + + /* Core ELF header buffer */ + void *elf_headers; + unsigned long elf_headers_sz; + unsigned long elf_load_addr; +}; +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_X86_64 +/* + * Number of elements and order of elements in this structure should match + * with the ones in arch/x86/purgatory/entry64.S. If you make a change here + * make an appropriate change in purgatory too. + */ +struct kexec_entry64_regs { + uint64_t rax; + uint64_t rcx; + uint64_t rdx; + uint64_t rbx; + uint64_t rsp; + uint64_t rbp; + uint64_t rsi; + uint64_t rdi; + uint64_t r8; + uint64_t r9; + uint64_t r10; + uint64_t r11; + uint64_t r12; + uint64_t r13; + uint64_t r14; + uint64_t r15; + uint64_t rip; }; #endif diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h index 4064acae625d..01b493e5a99b 100644 --- a/arch/x86/include/asm/numa.h +++ b/arch/x86/include/asm/numa.h @@ -9,7 +9,6 @@ #ifdef CONFIG_NUMA #define NR_NODE_MEMBLKS (MAX_NUMNODES*2) -#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT)) /* * Too small node sizes may confuse the VM badly. Usually they diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 5be9063545d2..809abb335627 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -115,7 +115,8 @@ static inline void native_pgd_clear(pgd_t *pgd) native_set_pgd(pgd, native_make_pgd(0)); } -extern void sync_global_pgds(unsigned long start, unsigned long end); +extern void sync_global_pgds(unsigned long start, unsigned long end, + int removed); /* * Conversion functions: convert a page and protection to a page entry, diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h deleted file mode 100644 index 4240878b9d76..000000000000 --- a/arch/x86/include/asm/scatterlist.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _ASM_X86_SCATTERLIST_H -#define _ASM_X86_SCATTERLIST_H - -#include <asm-generic/scatterlist.h> - -#define ARCH_HAS_SG_CHAIN - -#endif /* _ASM_X86_SCATTERLIST_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 047f9ff2e36c..ece67cbe139d 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -117,4 +117,5 @@ ifeq ($(CONFIG_X86_64),y) obj-$(CONFIG_PCI_MMCONFIG) += mmconf-fam10h_64.o obj-y += vsmp_64.o + obj-$(CONFIG_KEXEC) += kexec-bzimage64.o endif diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 507de8066594..0553a34fa0df 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -4,9 +4,14 @@ * Created by: Hariprasad Nellitheertha (hari@in.ibm.com) * * Copyright (C) IBM Corporation, 2004. All rights reserved. + * Copyright (C) Red Hat Inc., 2014. All rights reserved. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> * */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/smp.h> @@ -16,6 +21,7 @@ #include <linux/elf.h> #include <linux/elfcore.h> #include <linux/module.h> +#include <linux/slab.h> #include <asm/processor.h> #include <asm/hardirq.h> @@ -28,6 +34,45 @@ #include <asm/reboot.h> #include <asm/virtext.h> +/* Alignment required for elf header segment */ +#define ELF_CORE_HEADER_ALIGN 4096 + +/* This primarily represents number of split ranges due to exclusion */ +#define CRASH_MAX_RANGES 16 + +struct crash_mem_range { + u64 start, end; +}; + +struct crash_mem { + unsigned int nr_ranges; + struct crash_mem_range ranges[CRASH_MAX_RANGES]; +}; + +/* Misc data about ram ranges needed to prepare elf headers */ +struct crash_elf_data { + struct kimage *image; + /* + * Total number of ram ranges we have after various adjustments for + * GART, crash reserved region etc. + */ + unsigned int max_nr_ranges; + unsigned long gart_start, gart_end; + + /* Pointer to elf header */ + void *ehdr; + /* Pointer to next phdr */ + void *bufp; + struct crash_mem mem; +}; + +/* Used while preparing memory map entries for second kernel */ +struct crash_memmap_data { + struct boot_params *params; + /* Type of memory */ + unsigned int type; +}; + int in_crash_kexec; /* @@ -39,6 +84,7 @@ int in_crash_kexec; */ crash_vmclear_fn __rcu *crash_vmclear_loaded_vmcss = NULL; EXPORT_SYMBOL_GPL(crash_vmclear_loaded_vmcss); +unsigned long crash_zero_bytes; static inline void cpu_crash_vmclear_loaded_vmcss(void) { @@ -135,3 +181,520 @@ void native_machine_crash_shutdown(struct pt_regs *regs) #endif crash_save_cpu(regs, safe_smp_processor_id()); } + +#ifdef CONFIG_X86_64 + +static int get_nr_ram_ranges_callback(unsigned long start_pfn, + unsigned long nr_pfn, void *arg) +{ + int *nr_ranges = arg; + + (*nr_ranges)++; + return 0; +} + +static int get_gart_ranges_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + + ced->gart_start = start; + ced->gart_end = end; + + /* Not expecting more than 1 gart aperture */ + return 1; +} + + +/* Gather all the required information to prepare elf headers for ram regions */ +static void fill_up_crash_elf_data(struct crash_elf_data *ced, + struct kimage *image) +{ + unsigned int nr_ranges = 0; + + ced->image = image; + + walk_system_ram_range(0, -1, &nr_ranges, + get_nr_ram_ranges_callback); + + ced->max_nr_ranges = nr_ranges; + + /* + * We don't create ELF headers for GART aperture as an attempt + * to dump this memory in second kernel leads to hang/crash. + * If gart aperture is present, one needs to exclude that region + * and that could lead to need of extra phdr. + */ + walk_iomem_res("GART", IORESOURCE_MEM, 0, -1, + ced, get_gart_ranges_callback); + + /* + * If we have gart region, excluding that could potentially split + * a memory range, resulting in extra header. Account for that. + */ + if (ced->gart_end) + ced->max_nr_ranges++; + + /* Exclusion of crash region could split memory ranges */ + ced->max_nr_ranges++; + + /* If crashk_low_res is not 0, another range split possible */ + if (crashk_low_res.end != 0) + ced->max_nr_ranges++; +} + +static int exclude_mem_range(struct crash_mem *mem, + unsigned long long mstart, unsigned long long mend) +{ + int i, j; + unsigned long long start, end; + struct crash_mem_range temp_range = {0, 0}; + + for (i = 0; i < mem->nr_ranges; i++) { + start = mem->ranges[i].start; + end = mem->ranges[i].end; + + if (mstart > end || mend < start) + continue; + + /* Truncate any area outside of range */ + if (mstart < start) + mstart = start; + if (mend > end) + mend = end; + + /* Found completely overlapping range */ + if (mstart == start && mend == end) { + mem->ranges[i].start = 0; + mem->ranges[i].end = 0; + if (i < mem->nr_ranges - 1) { + /* Shift rest of the ranges to left */ + for (j = i; j < mem->nr_ranges - 1; j++) { + mem->ranges[j].start = + mem->ranges[j+1].start; + mem->ranges[j].end = + mem->ranges[j+1].end; + } + } + mem->nr_ranges--; + return 0; + } + + if (mstart > start && mend < end) { + /* Split original range */ + mem->ranges[i].end = mstart - 1; + temp_range.start = mend + 1; + temp_range.end = end; + } else if (mstart != start) + mem->ranges[i].end = mstart - 1; + else + mem->ranges[i].start = mend + 1; + break; + } + + /* If a split happend, add the split to array */ + if (!temp_range.end) + return 0; + + /* Split happened */ + if (i == CRASH_MAX_RANGES - 1) { + pr_err("Too many crash ranges after split\n"); + return -ENOMEM; + } + + /* Location where new range should go */ + j = i + 1; + if (j < mem->nr_ranges) { + /* Move over all ranges one slot towards the end */ + for (i = mem->nr_ranges - 1; i >= j; i--) + mem->ranges[i + 1] = mem->ranges[i]; + } + + mem->ranges[j].start = temp_range.start; + mem->ranges[j].end = temp_range.end; + mem->nr_ranges++; + return 0; +} + +/* + * Look for any unwanted ranges between mstart, mend and remove them. This + * might lead to split and split ranges are put in ced->mem.ranges[] array + */ +static int elf_header_exclude_ranges(struct crash_elf_data *ced, + unsigned long long mstart, unsigned long long mend) +{ + struct crash_mem *cmem = &ced->mem; + int ret = 0; + + memset(cmem->ranges, 0, sizeof(cmem->ranges)); + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude crashkernel region */ + ret = exclude_mem_range(cmem, crashk_res.start, crashk_res.end); + if (ret) + return ret; + + ret = exclude_mem_range(cmem, crashk_low_res.start, crashk_low_res.end); + if (ret) + return ret; + + /* Exclude GART region */ + if (ced->gart_end) { + ret = exclude_mem_range(cmem, ced->gart_start, ced->gart_end); + if (ret) + return ret; + } + + return ret; +} + +static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg) +{ + struct crash_elf_data *ced = arg; + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long mstart, mend; + struct kimage *image = ced->image; + struct crash_mem *cmem; + int ret, i; + + ehdr = ced->ehdr; + + /* Exclude unwanted mem ranges */ + ret = elf_header_exclude_ranges(ced, start, end); + if (ret) + return ret; + + /* Go through all the ranges in ced->mem.ranges[] and prepare phdr */ + cmem = &ced->mem; + + for (i = 0; i < cmem->nr_ranges; i++) { + mstart = cmem->ranges[i].start; + mend = cmem->ranges[i].end; + + phdr = ced->bufp; + ced->bufp += sizeof(Elf64_Phdr); + + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_offset = mstart; + + /* + * If a range matches backup region, adjust offset to backup + * segment. + */ + if (mstart == image->arch.backup_src_start && + (mend - mstart + 1) == image->arch.backup_src_sz) + phdr->p_offset = image->arch.backup_load_addr; + + phdr->p_paddr = mstart; + phdr->p_vaddr = (unsigned long long) __va(mstart); + phdr->p_filesz = phdr->p_memsz = mend - mstart + 1; + phdr->p_align = 0; + ehdr->e_phnum++; + pr_debug("Crash PT_LOAD elf header. phdr=%p vaddr=0x%llx, paddr=0x%llx, sz=0x%llx e_phnum=%d p_offset=0x%llx\n", + phdr, phdr->p_vaddr, phdr->p_paddr, phdr->p_filesz, + ehdr->e_phnum, phdr->p_offset); + } + + return ret; +} + +static int prepare_elf64_headers(struct crash_elf_data *ced, + void **addr, unsigned long *sz) +{ + Elf64_Ehdr *ehdr; + Elf64_Phdr *phdr; + unsigned long nr_cpus = num_possible_cpus(), nr_phdr, elf_sz; + unsigned char *buf, *bufp; + unsigned int cpu; + unsigned long long notes_addr; + int ret; + + /* extra phdr for vmcoreinfo elf note */ + nr_phdr = nr_cpus + 1; + nr_phdr += ced->max_nr_ranges; + + /* + * kexec-tools creates an extra PT_LOAD phdr for kernel text mapping + * area on x86_64 (ffffffff80000000 - ffffffffa0000000). + * I think this is required by tools like gdb. So same physical + * memory will be mapped in two elf headers. One will contain kernel + * text virtual addresses and other will have __va(physical) addresses. + */ + + nr_phdr++; + elf_sz = sizeof(Elf64_Ehdr) + nr_phdr * sizeof(Elf64_Phdr); + elf_sz = ALIGN(elf_sz, ELF_CORE_HEADER_ALIGN); + + buf = vzalloc(elf_sz); + if (!buf) + return -ENOMEM; + + bufp = buf; + ehdr = (Elf64_Ehdr *)bufp; + bufp += sizeof(Elf64_Ehdr); + memcpy(ehdr->e_ident, ELFMAG, SELFMAG); + ehdr->e_ident[EI_CLASS] = ELFCLASS64; + ehdr->e_ident[EI_DATA] = ELFDATA2LSB; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_OSABI] = ELF_OSABI; + memset(ehdr->e_ident + EI_PAD, 0, EI_NIDENT - EI_PAD); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_phoff = sizeof(Elf64_Ehdr); + ehdr->e_ehsize = sizeof(Elf64_Ehdr); + ehdr->e_phentsize = sizeof(Elf64_Phdr); + + /* Prepare one phdr of type PT_NOTE for each present cpu */ + for_each_present_cpu(cpu) { + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + notes_addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpu)); + phdr->p_offset = phdr->p_paddr = notes_addr; + phdr->p_filesz = phdr->p_memsz = sizeof(note_buf_t); + (ehdr->e_phnum)++; + } + + /* Prepare one PT_NOTE header for vmcoreinfo */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_NOTE; + phdr->p_offset = phdr->p_paddr = paddr_vmcoreinfo_note(); + phdr->p_filesz = phdr->p_memsz = sizeof(vmcoreinfo_note); + (ehdr->e_phnum)++; + +#ifdef CONFIG_X86_64 + /* Prepare PT_LOAD type program header for kernel text region */ + phdr = (Elf64_Phdr *)bufp; + bufp += sizeof(Elf64_Phdr); + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (Elf64_Addr)_text; + phdr->p_filesz = phdr->p_memsz = _end - _text; + phdr->p_offset = phdr->p_paddr = __pa_symbol(_text); + (ehdr->e_phnum)++; +#endif + + /* Prepare PT_LOAD headers for system ram chunks. */ + ced->ehdr = ehdr; + ced->bufp = bufp; + ret = walk_system_ram_res(0, -1, ced, + prepare_elf64_ram_headers_callback); + if (ret < 0) + return ret; + + *addr = buf; + *sz = elf_sz; + return 0; +} + +/* Prepare elf headers. Return addr and size */ +static int prepare_elf_headers(struct kimage *image, void **addr, + unsigned long *sz) +{ + struct crash_elf_data *ced; + int ret; + + ced = kzalloc(sizeof(*ced), GFP_KERNEL); + if (!ced) + return -ENOMEM; + + fill_up_crash_elf_data(ced, image); + + /* By default prepare 64bit headers */ + ret = prepare_elf64_headers(ced, addr, sz); + kfree(ced); + return ret; +} + +static int add_e820_entry(struct boot_params *params, struct e820entry *entry) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = params->e820_entries; + if (nr_e820_entries >= E820MAX) + return 1; + + memcpy(¶ms->e820_map[nr_e820_entries], entry, + sizeof(struct e820entry)); + params->e820_entries++; + return 0; +} + +static int memmap_entry_callback(u64 start, u64 end, void *arg) +{ + struct crash_memmap_data *cmd = arg; + struct boot_params *params = cmd->params; + struct e820entry ei; + + ei.addr = start; + ei.size = end - start + 1; + ei.type = cmd->type; + add_e820_entry(params, &ei); + + return 0; +} + +static int memmap_exclude_ranges(struct kimage *image, struct crash_mem *cmem, + unsigned long long mstart, + unsigned long long mend) +{ + unsigned long start, end; + int ret = 0; + + cmem->ranges[0].start = mstart; + cmem->ranges[0].end = mend; + cmem->nr_ranges = 1; + + /* Exclude Backup region */ + start = image->arch.backup_load_addr; + end = start + image->arch.backup_src_sz - 1; + ret = exclude_mem_range(cmem, start, end); + if (ret) + return ret; + + /* Exclude elf header region */ + start = image->arch.elf_load_addr; + end = start + image->arch.elf_headers_sz - 1; + return exclude_mem_range(cmem, start, end); +} + +/* Prepare memory map for crash dump kernel */ +int crash_setup_memmap_entries(struct kimage *image, struct boot_params *params) +{ + int i, ret = 0; + unsigned long flags; + struct e820entry ei; + struct crash_memmap_data cmd; + struct crash_mem *cmem; + + cmem = vzalloc(sizeof(struct crash_mem)); + if (!cmem) + return -ENOMEM; + + memset(&cmd, 0, sizeof(struct crash_memmap_data)); + cmd.params = params; + + /* Add first 640K segment */ + ei.addr = image->arch.backup_src_start; + ei.size = image->arch.backup_src_sz; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + + /* Add ACPI tables */ + cmd.type = E820_ACPI; + flags = IORESOURCE_MEM | IORESOURCE_BUSY; + walk_iomem_res("ACPI Tables", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add ACPI Non-volatile Storage */ + cmd.type = E820_NVS; + walk_iomem_res("ACPI Non-volatile Storage", flags, 0, -1, &cmd, + memmap_entry_callback); + + /* Add crashk_low_res region */ + if (crashk_low_res.end) { + ei.addr = crashk_low_res.start; + ei.size = crashk_low_res.end - crashk_low_res.start + 1; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + + /* Exclude some ranges from crashk_res and add rest to memmap */ + ret = memmap_exclude_ranges(image, cmem, crashk_res.start, + crashk_res.end); + if (ret) + goto out; + + for (i = 0; i < cmem->nr_ranges; i++) { + ei.size = cmem->ranges[i].end - cmem->ranges[i].start + 1; + + /* If entry is less than a page, skip it */ + if (ei.size < PAGE_SIZE) + continue; + ei.addr = cmem->ranges[i].start; + ei.type = E820_RAM; + add_e820_entry(params, &ei); + } + +out: + vfree(cmem); + return ret; +} + +static int determine_backup_region(u64 start, u64 end, void *arg) +{ + struct kimage *image = arg; + + image->arch.backup_src_start = start; + image->arch.backup_src_sz = end - start + 1; + + /* Expecting only one range for backup region */ + return 1; +} + +int crash_load_segments(struct kimage *image) +{ + unsigned long src_start, src_sz, elf_sz; + void *elf_addr; + int ret; + + /* + * Determine and load a segment for backup area. First 640K RAM + * region is backup source + */ + + ret = walk_system_ram_res(KEXEC_BACKUP_SRC_START, KEXEC_BACKUP_SRC_END, + image, determine_backup_region); + + /* Zero or postive return values are ok */ + if (ret < 0) + return ret; + + src_start = image->arch.backup_src_start; + src_sz = image->arch.backup_src_sz; + + /* Add backup segment. */ + if (src_sz) { + /* + * Ideally there is no source for backup segment. This is + * copied in purgatory after crash. Just add a zero filled + * segment for now to make sure checksum logic works fine. + */ + ret = kexec_add_buffer(image, (char *)&crash_zero_bytes, + sizeof(crash_zero_bytes), src_sz, + PAGE_SIZE, 0, -1, 0, + &image->arch.backup_load_addr); + if (ret) + return ret; + pr_debug("Loaded backup region at 0x%lx backup_start=0x%lx memsz=0x%lx\n", + image->arch.backup_load_addr, src_start, src_sz); + } + + /* Prepare elf headers and add a segment */ + ret = prepare_elf_headers(image, &elf_addr, &elf_sz); + if (ret) + return ret; + + image->arch.elf_headers = elf_addr; + image->arch.elf_headers_sz = elf_sz; + + ret = kexec_add_buffer(image, (char *)elf_addr, elf_sz, elf_sz, + ELF_CORE_HEADER_ALIGN, 0, -1, 0, + &image->arch.elf_load_addr); + if (ret) { + vfree((void *)image->arch.elf_headers); + return ret; + } + pr_debug("Loaded ELF headers at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + image->arch.elf_load_addr, elf_sz, elf_sz); + + return ret; +} + +#endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c new file mode 100644 index 000000000000..623e6c58081f --- /dev/null +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -0,0 +1,532 @@ +/* + * Kexec bzImage loader + * + * Copyright (C) 2014 Red Hat Inc. + * Authors: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#define pr_fmt(fmt) "kexec-bzImage64: " fmt + +#include <linux/string.h> +#include <linux/printk.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/kexec.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/efi.h> + +#include <asm/bootparam.h> +#include <asm/setup.h> +#include <asm/crash.h> +#include <asm/efi.h> + +#define MAX_ELFCOREHDR_STR_LEN 30 /* elfcorehdr=0x<64bit-value> */ + +/* + * Defines lowest physical address for various segments. Not sure where + * exactly these limits came from. Current bzimage64 loader in kexec-tools + * uses these so I am retaining it. It can be changed over time as we gain + * more insight. + */ +#define MIN_PURGATORY_ADDR 0x3000 +#define MIN_BOOTPARAM_ADDR 0x3000 +#define MIN_KERNEL_LOAD_ADDR 0x100000 +#define MIN_INITRD_LOAD_ADDR 0x1000000 + +/* + * This is a place holder for all boot loader specific data structure which + * gets allocated in one call but gets freed much later during cleanup + * time. Right now there is only one field but it can grow as need be. + */ +struct bzimage64_data { + /* + * Temporary buffer to hold bootparams buffer. This should be + * freed once the bootparam segment has been loaded. + */ + void *bootparams_buf; +}; + +static int setup_initrd(struct boot_params *params, + unsigned long initrd_load_addr, unsigned long initrd_len) +{ + params->hdr.ramdisk_image = initrd_load_addr & 0xffffffffUL; + params->hdr.ramdisk_size = initrd_len & 0xffffffffUL; + + params->ext_ramdisk_image = initrd_load_addr >> 32; + params->ext_ramdisk_size = initrd_len >> 32; + + return 0; +} + +static int setup_cmdline(struct kimage *image, struct boot_params *params, + unsigned long bootparams_load_addr, + unsigned long cmdline_offset, char *cmdline, + unsigned long cmdline_len) +{ + char *cmdline_ptr = ((char *)params) + cmdline_offset; + unsigned long cmdline_ptr_phys, len; + uint32_t cmdline_low_32, cmdline_ext_32; + + memcpy(cmdline_ptr, cmdline, cmdline_len); + if (image->type == KEXEC_TYPE_CRASH) { + len = sprintf(cmdline_ptr + cmdline_len - 1, + " elfcorehdr=0x%lx", image->arch.elf_load_addr); + cmdline_len += len; + } + cmdline_ptr[cmdline_len - 1] = '\0'; + + pr_debug("Final command line is: %s\n", cmdline_ptr); + cmdline_ptr_phys = bootparams_load_addr + cmdline_offset; + cmdline_low_32 = cmdline_ptr_phys & 0xffffffffUL; + cmdline_ext_32 = cmdline_ptr_phys >> 32; + + params->hdr.cmd_line_ptr = cmdline_low_32; + if (cmdline_ext_32) + params->ext_cmd_line_ptr = cmdline_ext_32; + + return 0; +} + +static int setup_e820_entries(struct boot_params *params) +{ + unsigned int nr_e820_entries; + + nr_e820_entries = e820_saved.nr_map; + + /* TODO: Pass entries more than E820MAX in bootparams setup data */ + if (nr_e820_entries > E820MAX) + nr_e820_entries = E820MAX; + + params->e820_entries = nr_e820_entries; + memcpy(¶ms->e820_map, &e820_saved.map, + nr_e820_entries * sizeof(struct e820entry)); + + return 0; +} + +#ifdef CONFIG_EFI +static int setup_efi_info_memmap(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, + unsigned int efi_map_sz) +{ + void *efi_map = (void *)params + efi_map_offset; + unsigned long efi_map_phys_addr = params_load_addr + efi_map_offset; + struct efi_info *ei = ¶ms->efi_info; + + if (!efi_map_sz) + return 0; + + efi_runtime_map_copy(efi_map, efi_map_sz); + + ei->efi_memmap = efi_map_phys_addr & 0xffffffff; + ei->efi_memmap_hi = efi_map_phys_addr >> 32; + ei->efi_memmap_size = efi_map_sz; + + return 0; +} + +static int +prepare_add_efi_setup_data(struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_setup_data_offset) +{ + unsigned long setup_data_phys; + struct setup_data *sd = (void *)params + efi_setup_data_offset; + struct efi_setup_data *esd = (void *)sd + sizeof(struct setup_data); + + esd->fw_vendor = efi.fw_vendor; + esd->runtime = efi.runtime; + esd->tables = efi.config_table; + esd->smbios = efi.smbios; + + sd->type = SETUP_EFI; + sd->len = sizeof(struct efi_setup_data); + + /* Add setup data */ + setup_data_phys = params_load_addr + efi_setup_data_offset; + sd->next = params->hdr.setup_data; + params->hdr.setup_data = setup_data_phys; + + return 0; +} + +static int +setup_efi_state(struct boot_params *params, unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + struct efi_info *current_ei = &boot_params.efi_info; + struct efi_info *ei = ¶ms->efi_info; + + if (!current_ei->efi_memmap_size) + return 0; + + /* + * If 1:1 mapping is not enabled, second kernel can not setup EFI + * and use EFI run time services. User space will have to pass + * acpi_rsdp=<addr> on kernel command line to make second kernel boot + * without efi. + */ + if (efi_enabled(EFI_OLD_MEMMAP)) + return 0; + + ei->efi_loader_signature = current_ei->efi_loader_signature; + ei->efi_systab = current_ei->efi_systab; + ei->efi_systab_hi = current_ei->efi_systab_hi; + + ei->efi_memdesc_version = current_ei->efi_memdesc_version; + ei->efi_memdesc_size = efi_get_runtime_map_desc_size(); + + setup_efi_info_memmap(params, params_load_addr, efi_map_offset, + efi_map_sz); + prepare_add_efi_setup_data(params, params_load_addr, + efi_setup_data_offset); + return 0; +} +#endif /* CONFIG_EFI */ + +static int +setup_boot_parameters(struct kimage *image, struct boot_params *params, + unsigned long params_load_addr, + unsigned int efi_map_offset, unsigned int efi_map_sz, + unsigned int efi_setup_data_offset) +{ + unsigned int nr_e820_entries; + unsigned long long mem_k, start, end; + int i, ret = 0; + + /* Get subarch from existing bootparams */ + params->hdr.hardware_subarch = boot_params.hdr.hardware_subarch; + + /* Copying screen_info will do? */ + memcpy(¶ms->screen_info, &boot_params.screen_info, + sizeof(struct screen_info)); + + /* Fill in memsize later */ + params->screen_info.ext_mem_k = 0; + params->alt_mem_k = 0; + + /* Default APM info */ + memset(¶ms->apm_bios_info, 0, sizeof(params->apm_bios_info)); + + /* Default drive info */ + memset(¶ms->hd0_info, 0, sizeof(params->hd0_info)); + memset(¶ms->hd1_info, 0, sizeof(params->hd1_info)); + + /* Default sysdesc table */ + params->sys_desc_table.length = 0; + + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_setup_memmap_entries(image, params); + if (ret) + return ret; + } else + setup_e820_entries(params); + + nr_e820_entries = params->e820_entries; + + for (i = 0; i < nr_e820_entries; i++) { + if (params->e820_map[i].type != E820_RAM) + continue; + start = params->e820_map[i].addr; + end = params->e820_map[i].addr + params->e820_map[i].size - 1; + + if ((start <= 0x100000) && end > 0x100000) { + mem_k = (end >> 10) - (0x100000 >> 10); + params->screen_info.ext_mem_k = mem_k; + params->alt_mem_k = mem_k; + if (mem_k > 0xfc00) + params->screen_info.ext_mem_k = 0xfc00; /* 64M*/ + if (mem_k > 0xffffffff) + params->alt_mem_k = 0xffffffff; + } + } + +#ifdef CONFIG_EFI + /* Setup EFI state */ + setup_efi_state(params, params_load_addr, efi_map_offset, efi_map_sz, + efi_setup_data_offset); +#endif + + /* Setup EDD info */ + memcpy(params->eddbuf, boot_params.eddbuf, + EDDMAXNR * sizeof(struct edd_info)); + params->eddbuf_entries = boot_params.eddbuf_entries; + + memcpy(params->edd_mbr_sig_buffer, boot_params.edd_mbr_sig_buffer, + EDD_MBR_SIG_MAX * sizeof(unsigned int)); + + return ret; +} + +int bzImage64_probe(const char *buf, unsigned long len) +{ + int ret = -ENOEXEC; + struct setup_header *header; + + /* kernel should be atleast two sectors long */ + if (len < 2 * 512) { + pr_err("File is too short to be a bzImage\n"); + return ret; + } + + header = (struct setup_header *)(buf + offsetof(struct boot_params, hdr)); + if (memcmp((char *)&header->header, "HdrS", 4) != 0) { + pr_err("Not a bzImage\n"); + return ret; + } + + if (header->boot_flag != 0xAA55) { + pr_err("No x86 boot sector present\n"); + return ret; + } + + if (header->version < 0x020C) { + pr_err("Must be at least protocol version 2.12\n"); + return ret; + } + + if (!(header->loadflags & LOADED_HIGH)) { + pr_err("zImage not a bzImage\n"); + return ret; + } + + if (!(header->xloadflags & XLF_KERNEL_64)) { + pr_err("Not a bzImage64. XLF_KERNEL_64 is not set.\n"); + return ret; + } + + if (!(header->xloadflags & XLF_CAN_BE_LOADED_ABOVE_4G)) { + pr_err("XLF_CAN_BE_LOADED_ABOVE_4G is not set.\n"); + return ret; + } + + /* + * Can't handle 32bit EFI as it does not allow loading kernel + * above 4G. This should be handled by 32bit bzImage loader + */ + if (efi_enabled(EFI_RUNTIME_SERVICES) && !efi_enabled(EFI_64BIT)) { + pr_debug("EFI is 32 bit. Can't load kernel above 4G.\n"); + return ret; + } + + /* I've got a bzImage */ + pr_debug("It's a relocatable bzImage64\n"); + ret = 0; + + return ret; +} + +void *bzImage64_load(struct kimage *image, char *kernel, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len) +{ + + struct setup_header *header; + int setup_sects, kern16_size, ret = 0; + unsigned long setup_header_size, params_cmdline_sz, params_misc_sz; + struct boot_params *params; + unsigned long bootparam_load_addr, kernel_load_addr, initrd_load_addr; + unsigned long purgatory_load_addr; + unsigned long kernel_bufsz, kernel_memsz, kernel_align; + char *kernel_buf; + struct bzimage64_data *ldata; + struct kexec_entry64_regs regs64; + void *stack; + unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); + unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; + + header = (struct setup_header *)(kernel + setup_hdr_offset); + setup_sects = header->setup_sects; + if (setup_sects == 0) + setup_sects = 4; + + kern16_size = (setup_sects + 1) * 512; + if (kernel_len < kern16_size) { + pr_err("bzImage truncated\n"); + return ERR_PTR(-ENOEXEC); + } + + if (cmdline_len > header->cmdline_size) { + pr_err("Kernel command line too long\n"); + return ERR_PTR(-EINVAL); + } + + /* + * In case of crash dump, we will append elfcorehdr=<addr> to + * command line. Make sure it does not overflow + */ + if (cmdline_len + MAX_ELFCOREHDR_STR_LEN > header->cmdline_size) { + pr_debug("Appending elfcorehdr=<addr> to command line exceeds maximum allowed length\n"); + return ERR_PTR(-EINVAL); + } + + /* Allocate and load backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = crash_load_segments(image); + if (ret) + return ERR_PTR(ret); + } + + /* + * Load purgatory. For 64bit entry point, purgatory code can be + * anywhere. + */ + ret = kexec_load_purgatory(image, MIN_PURGATORY_ADDR, ULONG_MAX, 1, + &purgatory_load_addr); + if (ret) { + pr_err("Loading purgatory failed\n"); + return ERR_PTR(ret); + } + + pr_debug("Loaded purgatory at 0x%lx\n", purgatory_load_addr); + + + /* + * Load Bootparams and cmdline and space for efi stuff. + * + * Allocate memory together for multiple data structures so + * that they all can go in single area/segment and we don't + * have to create separate segment for each. Keeps things + * little bit simple + */ + efi_map_sz = efi_get_runtime_map_size(); + efi_map_sz = ALIGN(efi_map_sz, 16); + params_cmdline_sz = sizeof(struct boot_params) + cmdline_len + + MAX_ELFCOREHDR_STR_LEN; + params_cmdline_sz = ALIGN(params_cmdline_sz, 16); + params_misc_sz = params_cmdline_sz + efi_map_sz + + sizeof(struct setup_data) + + sizeof(struct efi_setup_data); + + params = kzalloc(params_misc_sz, GFP_KERNEL); + if (!params) + return ERR_PTR(-ENOMEM); + efi_map_offset = params_cmdline_sz; + efi_setup_data_offset = efi_map_offset + efi_map_sz; + + /* Copy setup header onto bootparams. Documentation/x86/boot.txt */ + setup_header_size = 0x0202 + kernel[0x0201] - setup_hdr_offset; + + /* Is there a limit on setup header size? */ + memcpy(¶ms->hdr, (kernel + setup_hdr_offset), setup_header_size); + + ret = kexec_add_buffer(image, (char *)params, params_misc_sz, + params_misc_sz, 16, MIN_BOOTPARAM_ADDR, + ULONG_MAX, 1, &bootparam_load_addr); + if (ret) + goto out_free_params; + pr_debug("Loaded boot_param, command line and misc at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + bootparam_load_addr, params_misc_sz, params_misc_sz); + + /* Load kernel */ + kernel_buf = kernel + kern16_size; + kernel_bufsz = kernel_len - kern16_size; + kernel_memsz = PAGE_ALIGN(header->init_size); + kernel_align = header->kernel_alignment; + + ret = kexec_add_buffer(image, kernel_buf, + kernel_bufsz, kernel_memsz, kernel_align, + MIN_KERNEL_LOAD_ADDR, ULONG_MAX, 1, + &kernel_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded 64bit kernel at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + kernel_load_addr, kernel_memsz, kernel_memsz); + + /* Load initrd high */ + if (initrd) { + ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len, + PAGE_SIZE, MIN_INITRD_LOAD_ADDR, + ULONG_MAX, 1, &initrd_load_addr); + if (ret) + goto out_free_params; + + pr_debug("Loaded initrd at 0x%lx bufsz=0x%lx memsz=0x%lx\n", + initrd_load_addr, initrd_len, initrd_len); + + setup_initrd(params, initrd_load_addr, initrd_len); + } + + setup_cmdline(image, params, bootparam_load_addr, + sizeof(struct boot_params), cmdline, cmdline_len); + + /* bootloader info. Do we need a separate ID for kexec kernel loader? */ + params->hdr.type_of_loader = 0x0D << 4; + params->hdr.loadflags = 0; + + /* Setup purgatory regs for entry */ + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 1); + if (ret) + goto out_free_params; + + regs64.rbx = 0; /* Bootstrap Processor */ + regs64.rsi = bootparam_load_addr; + regs64.rip = kernel_load_addr + 0x200; + stack = kexec_purgatory_get_symbol_addr(image, "stack_end"); + if (IS_ERR(stack)) { + pr_err("Could not find address of symbol stack_end\n"); + ret = -EINVAL; + goto out_free_params; + } + + regs64.rsp = (unsigned long)stack; + ret = kexec_purgatory_get_set_symbol(image, "entry64_regs", ®s64, + sizeof(regs64), 0); + if (ret) + goto out_free_params; + + ret = setup_boot_parameters(image, params, bootparam_load_addr, + efi_map_offset, efi_map_sz, + efi_setup_data_offset); + if (ret) + goto out_free_params; + + /* Allocate loader specific data */ + ldata = kzalloc(sizeof(struct bzimage64_data), GFP_KERNEL); + if (!ldata) { + ret = -ENOMEM; + goto out_free_params; + } + + /* + * Store pointer to params so that it could be freed after loading + * params segment has been loaded and contents have been copied + * somewhere else. + */ + ldata->bootparams_buf = params; + return ldata; + +out_free_params: + kfree(params); + return ERR_PTR(ret); +} + +/* This cleanup function is called after various segments have been loaded */ +int bzImage64_cleanup(void *loader_data) +{ + struct bzimage64_data *ldata = loader_data; + + if (!ldata) + return 0; + + kfree(ldata->bootparams_buf); + ldata->bootparams_buf = NULL; + + return 0; +} + +struct kexec_file_ops kexec_bzImage64_ops = { + .probe = bzImage64_probe, + .load = bzImage64_load, + .cleanup = bzImage64_cleanup, +}; diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 679cef0791cd..9330434da777 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/mm.h> #include <linux/kexec.h> #include <linux/string.h> @@ -21,6 +23,11 @@ #include <asm/tlbflush.h> #include <asm/mmu_context.h> #include <asm/debugreg.h> +#include <asm/kexec-bzimage64.h> + +static struct kexec_file_ops *kexec_file_loaders[] = { + &kexec_bzImage64_ops, +}; static void free_transition_pgtable(struct kimage *image) { @@ -171,6 +178,38 @@ static void load_segments(void) ); } +/* Update purgatory as needed after various image segments have been prepared */ +static int arch_update_purgatory(struct kimage *image) +{ + int ret = 0; + + if (!image->file_mode) + return 0; + + /* Setup copying of backup region */ + if (image->type == KEXEC_TYPE_CRASH) { + ret = kexec_purgatory_get_set_symbol(image, "backup_dest", + &image->arch.backup_load_addr, + sizeof(image->arch.backup_load_addr), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_src", + &image->arch.backup_src_start, + sizeof(image->arch.backup_src_start), 0); + if (ret) + return ret; + + ret = kexec_purgatory_get_set_symbol(image, "backup_sz", + &image->arch.backup_src_sz, + sizeof(image->arch.backup_src_sz), 0); + if (ret) + return ret; + } + + return ret; +} + int machine_kexec_prepare(struct kimage *image) { unsigned long start_pgtable; @@ -184,6 +223,11 @@ int machine_kexec_prepare(struct kimage *image) if (result) return result; + /* update purgatory as needed */ + result = arch_update_purgatory(image); + if (result) + return result; + return 0; } @@ -283,3 +327,187 @@ void arch_crash_save_vmcoreinfo(void) (unsigned long)&_text - __START_KERNEL); } +/* arch-dependent functionality related to kexec file-based syscall */ + +int arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + int i, ret = -ENOEXEC; + struct kexec_file_ops *fops; + + for (i = 0; i < ARRAY_SIZE(kexec_file_loaders); i++) { + fops = kexec_file_loaders[i]; + if (!fops || !fops->probe) + continue; + + ret = fops->probe(buf, buf_len); + if (!ret) { + image->fops = fops; + return ret; + } + } + + return ret; +} + +void *arch_kexec_kernel_image_load(struct kimage *image) +{ + vfree(image->arch.elf_headers); + image->arch.elf_headers = NULL; + + if (!image->fops || !image->fops->load) + return ERR_PTR(-ENOEXEC); + + return image->fops->load(image, image->kernel_buf, + image->kernel_buf_len, image->initrd_buf, + image->initrd_buf_len, image->cmdline_buf, + image->cmdline_buf_len); +} + +int arch_kimage_file_post_load_cleanup(struct kimage *image) +{ + if (!image->fops || !image->fops->cleanup) + return 0; + + return image->fops->cleanup(image->image_loader_data); +} + +/* + * Apply purgatory relocations. + * + * ehdr: Pointer to elf headers + * sechdrs: Pointer to section headers. + * relsec: section index of SHT_RELA section. + * + * TODO: Some of the code belongs to generic code. Move that in kexec.c. + */ +int arch_kexec_apply_relocations_add(const Elf64_Ehdr *ehdr, + Elf64_Shdr *sechdrs, unsigned int relsec) +{ + unsigned int i; + Elf64_Rela *rel; + Elf64_Sym *sym; + void *location; + Elf64_Shdr *section, *symtabsec; + unsigned long address, sec_base, value; + const char *strtab, *name, *shstrtab; + + /* + * ->sh_offset has been modified to keep the pointer to section + * contents in memory + */ + rel = (void *)sechdrs[relsec].sh_offset; + + /* Section to which relocations apply */ + section = &sechdrs[sechdrs[relsec].sh_info]; + + pr_debug("Applying relocate section %u to %u\n", relsec, + sechdrs[relsec].sh_info); + + /* Associated symbol table */ + symtabsec = &sechdrs[sechdrs[relsec].sh_link]; + + /* String table */ + if (symtabsec->sh_link >= ehdr->e_shnum) { + /* Invalid strtab section number */ + pr_err("Invalid string table section index %d\n", + symtabsec->sh_link); + return -ENOEXEC; + } + + strtab = (char *)sechdrs[symtabsec->sh_link].sh_offset; + + /* section header string table */ + shstrtab = (char *)sechdrs[ehdr->e_shstrndx].sh_offset; + + for (i = 0; i < sechdrs[relsec].sh_size / sizeof(*rel); i++) { + + /* + * rel[i].r_offset contains byte offset from beginning + * of section to the storage unit affected. + * + * This is location to update (->sh_offset). This is temporary + * buffer where section is currently loaded. This will finally + * be loaded to a different address later, pointed to by + * ->sh_addr. kexec takes care of moving it + * (kexec_load_segment()). + */ + location = (void *)(section->sh_offset + rel[i].r_offset); + + /* Final address of the location */ + address = section->sh_addr + rel[i].r_offset; + + /* + * rel[i].r_info contains information about symbol table index + * w.r.t which relocation must be made and type of relocation + * to apply. ELF64_R_SYM() and ELF64_R_TYPE() macros get + * these respectively. + */ + sym = (Elf64_Sym *)symtabsec->sh_offset + + ELF64_R_SYM(rel[i].r_info); + + if (sym->st_name) + name = strtab + sym->st_name; + else + name = shstrtab + sechdrs[sym->st_shndx].sh_name; + + pr_debug("Symbol: %s info: %02x shndx: %02x value=%llx size: %llx\n", + name, sym->st_info, sym->st_shndx, sym->st_value, + sym->st_size); + + if (sym->st_shndx == SHN_UNDEF) { + pr_err("Undefined symbol: %s\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_COMMON) { + pr_err("symbol '%s' in common section\n", name); + return -ENOEXEC; + } + + if (sym->st_shndx == SHN_ABS) + sec_base = 0; + else if (sym->st_shndx >= ehdr->e_shnum) { + pr_err("Invalid section %d for symbol %s\n", + sym->st_shndx, name); + return -ENOEXEC; + } else + sec_base = sechdrs[sym->st_shndx].sh_addr; + + value = sym->st_value; + value += sec_base; + value += rel[i].r_addend; + + switch (ELF64_R_TYPE(rel[i].r_info)) { + case R_X86_64_NONE: + break; + case R_X86_64_64: + *(u64 *)location = value; + break; + case R_X86_64_32: + *(u32 *)location = value; + if (value != *(u32 *)location) + goto overflow; + break; + case R_X86_64_32S: + *(s32 *)location = value; + if ((s64)value != *(s32 *)location) + goto overflow; + break; + case R_X86_64_PC32: + value -= (u64)address; + *(u32 *)location = value; + break; + default: + pr_err("Unknown rela relocation: %llu\n", + ELF64_R_TYPE(rel[i].r_info)); + return -ENOEXEC; + } + } + return 0; + +overflow: + pr_err("Overflow in relocation type %d value 0x%lx\n", + (int)ELF64_R_TYPE(rel[i].r_info), value); + return -ENOEXEC; +} diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 1dbade870f90..d393ac669cc0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -350,7 +350,7 @@ out: void vmalloc_sync_all(void) { - sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END); + sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0); } /* @@ -1218,7 +1218,8 @@ good_area: /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo - * the fault: + * the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if + * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked. */ fault = handle_mm_fault(mm, vma, address, flags); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index df1a9927ad29..8f6803290083 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -178,7 +178,7 @@ __setup("noexec32=", nonx32_setup); * When memory was added/removed make sure all the processes MM have * suitable PGD entries in the local PGD level page. */ -void sync_global_pgds(unsigned long start, unsigned long end) +void sync_global_pgds(unsigned long start, unsigned long end, int removed) { unsigned long address; @@ -186,7 +186,12 @@ void sync_global_pgds(unsigned long start, unsigned long end) const pgd_t *pgd_ref = pgd_offset_k(address); struct page *page; - if (pgd_none(*pgd_ref)) + /* + * When it is called after memory hot remove, pgd_none() + * returns true. In this case (removed == 1), we must clear + * the PGD entries in the local PGD level page. + */ + if (pgd_none(*pgd_ref) && !removed) continue; spin_lock(&pgd_lock); @@ -199,12 +204,18 @@ void sync_global_pgds(unsigned long start, unsigned long end) pgt_lock = &pgd_page_get_mm(page)->page_table_lock; spin_lock(pgt_lock); - if (pgd_none(*pgd)) - set_pgd(pgd, *pgd_ref); - else + if (!pgd_none(*pgd_ref) && !pgd_none(*pgd)) BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref)); + if (removed) { + if (pgd_none(*pgd_ref) && !pgd_none(*pgd)) + pgd_clear(pgd); + } else { + if (pgd_none(*pgd)) + set_pgd(pgd, *pgd_ref); + } + spin_unlock(pgt_lock); } spin_unlock(&pgd_lock); @@ -633,7 +644,7 @@ kernel_physical_mapping_init(unsigned long start, } if (pgd_changed) - sync_global_pgds(addr, end - 1); + sync_global_pgds(addr, end - 1, 0); __flush_tlb_all(); @@ -975,25 +986,26 @@ static void __meminit remove_pagetable(unsigned long start, unsigned long end, bool direct) { unsigned long next; + unsigned long addr; pgd_t *pgd; pud_t *pud; bool pgd_changed = false; - for (; start < end; start = next) { - next = pgd_addr_end(start, end); + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); - pgd = pgd_offset_k(start); + pgd = pgd_offset_k(addr); if (!pgd_present(*pgd)) continue; pud = (pud_t *)pgd_page_vaddr(*pgd); - remove_pud_table(pud, start, next, direct); + remove_pud_table(pud, addr, next, direct); if (free_pud_table(pud, pgd)) pgd_changed = true; } if (pgd_changed) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 1); flush_tlb_all(); } @@ -1340,7 +1352,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node) else err = vmemmap_populate_basepages(start, end, node); if (!err) - sync_global_pgds(start, end - 1); + sync_global_pgds(start, end - 1, 0); return err; } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index a32b706c401a..d221374d5ce8 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -185,8 +185,8 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) return numa_add_memblk_to(nid, start, end, &numa_meminfo); } -/* Initialize NODE_DATA for a node on the local memory */ -static void __init setup_node_data(int nid, u64 start, u64 end) +/* Allocate NODE_DATA for a node on the local memory */ +static void __init alloc_node_data(int nid) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); u64 nd_pa; @@ -194,18 +194,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) int tnid; /* - * Don't confuse VM with a node that doesn't have the - * minimum amount of memory: - */ - if (end && (end - start) < NODE_MIN_SIZE) - return; - - start = roundup(start, ZONE_ALIGN); - - printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", - nid, start, end - 1); - - /* * Allocate node data. Try node-local memory and then any node. * Never allocate in DMA zone. */ @@ -222,7 +210,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + printk(KERN_INFO "NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid, nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); if (tnid != nid) @@ -230,9 +218,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) node_data[nid] = nd; memset(NODE_DATA(nid), 0, sizeof(pg_data_t)); - NODE_DATA(nid)->node_id = nid; - NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT; - NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT; node_set_online(nid); } @@ -523,8 +508,17 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) end = max(mi->blk[i].end, end); } - if (start < end) - setup_node_data(nid, start, end); + if (start >= end) + continue; + + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + continue; + + alloc_node_data(nid); } /* Dump memblock with node info and return. */ diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile new file mode 100644 index 000000000000..7fde9ee438a4 --- /dev/null +++ b/arch/x86/purgatory/Makefile @@ -0,0 +1,30 @@ +purgatory-y := purgatory.o stack.o setup-x86_$(BITS).o sha256.o entry64.o string.o + +targets += $(purgatory-y) +PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y)) + +LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib +targets += purgatory.ro + +# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That +# in turn leaves some undefined symbols like __fentry__ in purgatory and not +# sure how to relocate those. Like kexec-tools, use custom flags. + +KBUILD_CFLAGS := -fno-strict-aliasing -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fno-builtin -ffreestanding -c -MD -Os -mcmodel=large + +$(obj)/purgatory.ro: $(PURGATORY_OBJS) FORCE + $(call if_changed,ld) + +targets += kexec-purgatory.c + +quiet_cmd_bin2c = BIN2C $@ + cmd_bin2c = cat $(obj)/purgatory.ro | $(objtree)/scripts/basic/bin2c kexec_purgatory > $(obj)/kexec-purgatory.c + +$(obj)/kexec-purgatory.c: $(obj)/purgatory.ro FORCE + $(call if_changed,bin2c) + + +# No loaders for 32bits yet. +ifeq ($(CONFIG_X86_64),y) + obj-$(CONFIG_KEXEC) += kexec-purgatory.o +endif diff --git a/arch/x86/purgatory/entry64.S b/arch/x86/purgatory/entry64.S new file mode 100644 index 000000000000..d1a4291d3568 --- /dev/null +++ b/arch/x86/purgatory/entry64.S @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + + * Author(s): Vivek Goyal <vgoyal@redhat.com> + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .balign 16 + .code64 + .globl entry64, entry64_regs + + +entry64: + /* Setup a gdt that should be preserved */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup new stack */ + leaq stack_init(%rip), %rsp + pushq $0x10 /* CS */ + leaq new_cs_exit(%rip), %rax + pushq %rax + lretq +new_cs_exit: + + /* Load the registers */ + movq rax(%rip), %rax + movq rbx(%rip), %rbx + movq rcx(%rip), %rcx + movq rdx(%rip), %rdx + movq rsi(%rip), %rsi + movq rdi(%rip), %rdi + movq rsp(%rip), %rsp + movq rbp(%rip), %rbp + movq r8(%rip), %r8 + movq r9(%rip), %r9 + movq r10(%rip), %r10 + movq r11(%rip), %r11 + movq r12(%rip), %r12 + movq r13(%rip), %r13 + movq r14(%rip), %r14 + movq r15(%rip), %r15 + + /* Jump to the new code... */ + jmpq *rip(%rip) + + .section ".rodata" + .balign 4 +entry64_regs: +rax: .quad 0x0 +rcx: .quad 0x0 +rdx: .quad 0x0 +rbx: .quad 0x0 +rsp: .quad 0x0 +rbp: .quad 0x0 +rsi: .quad 0x0 +rdi: .quad 0x0 +r8: .quad 0x0 +r9: .quad 0x0 +r10: .quad 0x0 +r11: .quad 0x0 +r12: .quad 0x0 +r13: .quad 0x0 +r14: .quad 0x0 +r15: .quad 0x0 +rip: .quad 0x0 + .size entry64_regs, . - entry64_regs + + /* GDT */ + .section ".rodata" + .balign 16 +gdt: + /* 0x00 unusable segment + * 0x08 unused + * so use them as gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: +stack: .quad 0, 0 +stack_init: diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c new file mode 100644 index 000000000000..25e068ba3382 --- /dev/null +++ b/arch/x86/purgatory/purgatory.c @@ -0,0 +1,72 @@ +/* + * purgatory: Runs between two kernels + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "sha256.h" +#include "../boot/string.h" + +struct sha_region { + unsigned long start; + unsigned long len; +}; + +unsigned long backup_dest = 0; +unsigned long backup_src = 0; +unsigned long backup_sz = 0; + +u8 sha256_digest[SHA256_DIGEST_SIZE] = { 0 }; + +struct sha_region sha_regions[16] = {}; + +/* + * On x86, second kernel requries first 640K of memory to boot. Copy + * first 640K to a backup region in reserved memory range so that second + * kernel can use first 640K. + */ +static int copy_backup_region(void) +{ + if (backup_dest) + memcpy((void *)backup_dest, (void *)backup_src, backup_sz); + + return 0; +} + +int verify_sha256_digest(void) +{ + struct sha_region *ptr, *end; + u8 digest[SHA256_DIGEST_SIZE]; + struct sha256_state sctx; + + sha256_init(&sctx); + end = &sha_regions[sizeof(sha_regions)/sizeof(sha_regions[0])]; + for (ptr = sha_regions; ptr < end; ptr++) + sha256_update(&sctx, (uint8_t *)(ptr->start), ptr->len); + + sha256_final(&sctx, digest); + + if (memcmp(digest, sha256_digest, sizeof(digest))) + return 1; + + return 0; +} + +void purgatory(void) +{ + int ret; + + ret = verify_sha256_digest(); + if (ret) { + /* loop forever */ + for (;;) + ; + } + copy_backup_region(); +} diff --git a/arch/x86/purgatory/setup-x86_64.S b/arch/x86/purgatory/setup-x86_64.S new file mode 100644 index 000000000000..fe3c91ba1bd0 --- /dev/null +++ b/arch/x86/purgatory/setup-x86_64.S @@ -0,0 +1,58 @@ +/* + * purgatory: setup code + * + * Copyright (C) 2003,2004 Eric Biederman (ebiederm@xmission.com) + * Copyright (C) 2014 Red Hat Inc. + * + * This code has been taken from kexec-tools. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + .text + .globl purgatory_start + .balign 16 +purgatory_start: + .code64 + + /* Load a gdt so I know what the segment registers are */ + lgdt gdt(%rip) + + /* load the data segments */ + movl $0x18, %eax /* data segment */ + movl %eax, %ds + movl %eax, %es + movl %eax, %ss + movl %eax, %fs + movl %eax, %gs + + /* Setup a stack */ + leaq lstack_end(%rip), %rsp + + /* Call the C code */ + call purgatory + jmp entry64 + + .section ".rodata" + .balign 16 +gdt: /* 0x00 unusable segment + * 0x08 unused + * so use them as the gdt ptr + */ + .word gdt_end - gdt - 1 + .quad gdt + .word 0, 0, 0 + + /* 0x10 4GB flat code segment */ + .word 0xFFFF, 0x0000, 0x9A00, 0x00AF + + /* 0x18 4GB flat data segment */ + .word 0xFFFF, 0x0000, 0x9200, 0x00CF +gdt_end: + + .bss + .balign 4096 +lstack: + .skip 4096 +lstack_end: diff --git a/arch/x86/purgatory/sha256.c b/arch/x86/purgatory/sha256.c new file mode 100644 index 000000000000..548ca675a14a --- /dev/null +++ b/arch/x86/purgatory/sha256.c @@ -0,0 +1,283 @@ +/* + * SHA-256, as specified in + * http://csrc.nist.gov/groups/STM/cavp/documents/shs/sha256-384-512.pdf + * + * SHA-256 code by Jean-Luc Cooke <jlcooke@certainkey.com>. + * + * Copyright (c) Jean-Luc Cooke <jlcooke@certainkey.com> + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> + * Copyright (c) 2014 Red Hat Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include <linux/bitops.h> +#include <asm/byteorder.h> +#include "sha256.h" +#include "../boot/string.h" + +static inline u32 Ch(u32 x, u32 y, u32 z) +{ + return z ^ (x & (y ^ z)); +} + +static inline u32 Maj(u32 x, u32 y, u32 z) +{ + return (x & y) | (z & (x | y)); +} + +#define e0(x) (ror32(x, 2) ^ ror32(x, 13) ^ ror32(x, 22)) +#define e1(x) (ror32(x, 6) ^ ror32(x, 11) ^ ror32(x, 25)) +#define s0(x) (ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3)) +#define s1(x) (ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10)) + +static inline void LOAD_OP(int I, u32 *W, const u8 *input) +{ + W[I] = __be32_to_cpu(((__be32 *)(input))[I]); +} + +static inline void BLEND_OP(int I, u32 *W) +{ + W[I] = s1(W[I-2]) + W[I-7] + s0(W[I-15]) + W[I-16]; +} + +static void sha256_transform(u32 *state, const u8 *input) +{ + u32 a, b, c, d, e, f, g, h, t1, t2; + u32 W[64]; + int i; + + /* load the input */ + for (i = 0; i < 16; i++) + LOAD_OP(i, W, input); + + /* now blend */ + for (i = 16; i < 64; i++) + BLEND_OP(i, W); + + /* load the state into our registers */ + a = state[0]; b = state[1]; c = state[2]; d = state[3]; + e = state[4]; f = state[5]; g = state[6]; h = state[7]; + + /* now iterate */ + t1 = h + e1(e) + Ch(e, f, g) + 0x428a2f98 + W[0]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x71374491 + W[1]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb5c0fbcf + W[2]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xe9b5dba5 + W[3]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x3956c25b + W[4]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x59f111f1 + W[5]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x923f82a4 + W[6]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xab1c5ed5 + W[7]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1 + t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xd807aa98 + W[8]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1 + t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x12835b01 + W[9]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1 + t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x243185be + W[10]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1 + t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x550c7dc3 + W[11]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1 + t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x72be5d74 + W[12]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1 + t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x80deb1fe + W[13]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1 + t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x9bdc06a7 + W[14]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1 + t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc19bf174 + W[15]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xe49b69c1 + W[16]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xefbe4786 + W[17]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x0fc19dc6 + W[18]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x240ca1cc + W[19]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x2de92c6f + W[20]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4a7484aa + W[21]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5cb0a9dc + W[22]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x76f988da + W[23]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x983e5152 + W[24]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa831c66d + W[25]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xb00327c8 + W[26]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xbf597fc7 + W[27]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xc6e00bf3 + W[28]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd5a79147 + W[29]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x06ca6351 + W[30]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x14292967 + W[31]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x27b70a85 + W[32]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x2e1b2138 + W[33]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x4d2c6dfc + W[34]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x53380d13 + W[35]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x650a7354 + W[36]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x766a0abb + W[37]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x81c2c92e + W[38]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x92722c85 + W[39]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0xa2bfe8a1 + W[40]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0xa81a664b + W[41]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0xc24b8b70 + W[42]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0xc76c51a3 + W[43]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0xd192e819 + W[44]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xd6990624 + W[45]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xf40e3585 + W[46]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x106aa070 + W[47]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x19a4c116 + W[48]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x1e376c08 + W[49]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x2748774c + W[50]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x34b0bcb5 + W[51]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x391c0cb3 + W[52]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0x4ed8aa4a + W[53]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0x5b9cca4f + W[54]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0x682e6ff3 + W[55]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + t1 = h + e1(e) + Ch(e, f, g) + 0x748f82ee + W[56]; + t2 = e0(a) + Maj(a, b, c); d += t1; h = t1+t2; + t1 = g + e1(d) + Ch(d, e, f) + 0x78a5636f + W[57]; + t2 = e0(h) + Maj(h, a, b); c += t1; g = t1+t2; + t1 = f + e1(c) + Ch(c, d, e) + 0x84c87814 + W[58]; + t2 = e0(g) + Maj(g, h, a); b += t1; f = t1+t2; + t1 = e + e1(b) + Ch(b, c, d) + 0x8cc70208 + W[59]; + t2 = e0(f) + Maj(f, g, h); a += t1; e = t1+t2; + t1 = d + e1(a) + Ch(a, b, c) + 0x90befffa + W[60]; + t2 = e0(e) + Maj(e, f, g); h += t1; d = t1+t2; + t1 = c + e1(h) + Ch(h, a, b) + 0xa4506ceb + W[61]; + t2 = e0(d) + Maj(d, e, f); g += t1; c = t1+t2; + t1 = b + e1(g) + Ch(g, h, a) + 0xbef9a3f7 + W[62]; + t2 = e0(c) + Maj(c, d, e); f += t1; b = t1+t2; + t1 = a + e1(f) + Ch(f, g, h) + 0xc67178f2 + W[63]; + t2 = e0(b) + Maj(b, c, d); e += t1; a = t1+t2; + + state[0] += a; state[1] += b; state[2] += c; state[3] += d; + state[4] += e; state[5] += f; state[6] += g; state[7] += h; + + /* clear any sensitive info... */ + a = b = c = d = e = f = g = h = t1 = t2 = 0; + memset(W, 0, 64 * sizeof(u32)); +} + +int sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; + + return 0; +} + +int sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len) +{ + unsigned int partial, done; + const u8 *src; + + partial = sctx->count & 0x3f; + sctx->count += len; + done = 0; + src = data; + + if ((partial + len) > 63) { + if (partial) { + done = -partial; + memcpy(sctx->buf + partial, data, done + 64); + src = sctx->buf; + } + + do { + sha256_transform(sctx->state, src); + done += 64; + src = data + done; + } while (done + 63 < len); + + partial = 0; + } + memcpy(sctx->buf + partial, src, len - done); + + return 0; +} + +int sha256_final(struct sha256_state *sctx, u8 *out) +{ + __be32 *dst = (__be32 *)out; + __be64 bits; + unsigned int index, pad_len; + int i; + static const u8 padding[64] = { 0x80, }; + + /* Save number of bits */ + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64. */ + index = sctx->count & 0x3f; + pad_len = (index < 56) ? (56 - index) : ((64+56) - index); + sha256_update(sctx, padding, pad_len); + + /* Append length (before padding) */ + sha256_update(sctx, (const u8 *)&bits, sizeof(bits)); + + /* Store state in digest */ + for (i = 0; i < 8; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Zeroize sensitive information. */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} diff --git a/arch/x86/purgatory/sha256.h b/arch/x86/purgatory/sha256.h new file mode 100644 index 000000000000..bd15a4127735 --- /dev/null +++ b/arch/x86/purgatory/sha256.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2014 Red Hat Inc. + * + * Author: Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#ifndef SHA256_H +#define SHA256_H + + +#include <linux/types.h> +#include <crypto/sha.h> + +extern int sha256_init(struct sha256_state *sctx); +extern int sha256_update(struct sha256_state *sctx, const u8 *input, + unsigned int length); +extern int sha256_final(struct sha256_state *sctx, u8 *hash); + +#endif /* SHA256_H */ diff --git a/arch/x86/purgatory/stack.S b/arch/x86/purgatory/stack.S new file mode 100644 index 000000000000..3cefba1fefc8 --- /dev/null +++ b/arch/x86/purgatory/stack.S @@ -0,0 +1,19 @@ +/* + * purgatory: stack + * + * Copyright (C) 2014 Red Hat Inc. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + + /* A stack for the loaded kernel. + * Seperate and in the data section so it can be prepopulated. + */ + .data + .balign 4096 + .globl stack, stack_end + +stack: + .skip 4096 +stack_end: diff --git a/arch/x86/purgatory/string.c b/arch/x86/purgatory/string.c new file mode 100644 index 000000000000..d886b1fa36f0 --- /dev/null +++ b/arch/x86/purgatory/string.c @@ -0,0 +1,13 @@ +/* + * Simple string functions. + * + * Copyright (C) 2014 Red Hat Inc. + * + * Author: + * Vivek Goyal <vgoyal@redhat.com> + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include "../boot/string.c" diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 6705032c0520..17df6122b1e7 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -324,6 +324,7 @@ 315 common sched_getattr sys_sched_getattr 316 common renameat2 sys_renameat2 317 common getrandom sys_getrandom +318 common kexec_file_load sys_kexec_file_load # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/block/bio-integrity.c b/block/bio-integrity.c index bc423f7b02da..38c8ac2970ac 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -646,6 +646,4 @@ void __init bio_integrity_init(void) sizeof(struct bio_integrity_payload) + sizeof(struct bio_vec) * BIP_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - if (!bip_slab) - panic("Failed to create slab\n"); } diff --git a/block/genhd.c b/block/genhd.c index 791f41943132..7bd4372e8b6f 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -849,7 +849,7 @@ static int show_partition(struct seq_file *seqf, void *v) char buf[BDEVNAME_SIZE]; /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + if (!get_capacity(sgp) || (!(disk_max_parts(sgp) > 1) && (sgp->flags & GENHD_FL_REMOVABLE))) return 0; if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) diff --git a/crypto/zlib.c b/crypto/zlib.c index 06b62e5cdcc7..c9ee681d57fd 100644 --- a/crypto/zlib.c +++ b/crypto/zlib.c @@ -168,7 +168,7 @@ static int zlib_compress_update(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -198,7 +198,7 @@ static int zlib_compress_final(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -283,7 +283,7 @@ static int zlib_decompress_update(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; @@ -331,7 +331,7 @@ static int zlib_decompress_final(struct crypto_pcomp *tfm, } ret = req->avail_out - stream->avail_out; - pr_debug("avail_in %u, avail_out %u (consumed %u, produced %u)\n", + pr_debug("avail_in %lu, avail_out %lu (consumed %lu, produced %u)\n", stream->avail_in, stream->avail_out, req->avail_in - stream->avail_in, ret); req->next_in = stream->next_in; diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index 7671dbac6015..b0d5b5a12147 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -16,6 +16,7 @@ menuconfig ATA depends on BLOCK depends on !(M32R || M68K || S390) || BROKEN select SCSI + select GLOB ---help--- If you want to use an ATA hard disk, ATA tape drive, ATA CD-ROM or any other ATA device under Linux, say Y and make sure that you know diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index d19c37a7abc9..259d87937ce7 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -59,6 +59,7 @@ #include <linux/async.h> #include <linux/log2.h> #include <linux/slab.h> +#include <linux/glob.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> @@ -4250,73 +4251,6 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { } }; -/** - * glob_match - match a text string against a glob-style pattern - * @text: the string to be examined - * @pattern: the glob-style pattern to be matched against - * - * Either/both of text and pattern can be empty strings. - * - * Match text against a glob-style pattern, with wildcards and simple sets: - * - * ? matches any single character. - * * matches any run of characters. - * [xyz] matches a single character from the set: x, y, or z. - * [a-d] matches a single character from the range: a, b, c, or d. - * [a-d0-9] matches a single character from either range. - * - * The special characters ?, [, -, or *, can be matched using a set, eg. [*] - * Behaviour with malformed patterns is undefined, though generally reasonable. - * - * Sample patterns: "SD1?", "SD1[0-5]", "*R0", "SD*1?[012]*xx" - * - * This function uses one level of recursion per '*' in pattern. - * Since it calls _nothing_ else, and has _no_ explicit local variables, - * this will not cause stack problems for any reasonable use here. - * - * RETURNS: - * 0 on match, 1 otherwise. - */ -static int glob_match (const char *text, const char *pattern) -{ - do { - /* Match single character or a '?' wildcard */ - if (*text == *pattern || *pattern == '?') { - if (!*pattern++) - return 0; /* End of both strings: match */ - } else { - /* Match single char against a '[' bracketed ']' pattern set */ - if (!*text || *pattern != '[') - break; /* Not a pattern set */ - while (*++pattern && *pattern != ']' && *text != *pattern) { - if (*pattern == '-' && *(pattern - 1) != '[') - if (*text > *(pattern - 1) && *text < *(pattern + 1)) { - ++pattern; - break; - } - } - if (!*pattern || *pattern == ']') - return 1; /* No match */ - while (*pattern && *pattern++ != ']'); - } - } while (*++text && *pattern); - - /* Match any run of chars against a '*' wildcard */ - if (*pattern == '*') { - if (!*++pattern) - return 0; /* Match: avoid recursion at end of pattern */ - /* Loop to handle additional pattern chars after the wildcard */ - while (*text) { - if (glob_match(text, pattern) == 0) - return 0; /* Remainder matched */ - ++text; /* Absorb (match) this char and try again */ - } - } - if (!*text && !*pattern) - return 0; /* End of both strings: match */ - return 1; /* No match */ -} - static unsigned long ata_dev_blacklisted(const struct ata_device *dev) { unsigned char model_num[ATA_ID_PROD_LEN + 1]; @@ -4327,10 +4261,10 @@ static unsigned long ata_dev_blacklisted(const struct ata_device *dev) ata_id_c_string(dev->id, model_rev, ATA_ID_FW_REV, sizeof(model_rev)); while (ad->model_num) { - if (!glob_match(model_num, ad->model_num)) { + if (glob_match(model_num, ad->model_num)) { if (ad->model_rev == NULL) return ad->horkage; - if (!glob_match(model_rev, ad->model_rev)) + if (glob_match(model_rev, ad->model_rev)) return ad->horkage; } ad++; diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 88500fed3c7a..4e7f0ff83ae7 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -289,16 +289,6 @@ config CMA_ALIGNMENT If unsure, leave the default value "8". -config CMA_AREAS - int "Maximum count of the CMA device-private areas" - default 7 - help - CMA allows to create CMA areas for particular devices. This parameter - sets the maximum number of such device private CMA areas in the - system. - - If unsure, leave the default value "7". - endif endmenu diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 6467c919c509..6606abdf880c 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -24,23 +24,9 @@ #include <linux/memblock.h> #include <linux/err.h> -#include <linux/mm.h> -#include <linux/mutex.h> -#include <linux/page-isolation.h> #include <linux/sizes.h> -#include <linux/slab.h> -#include <linux/swap.h> -#include <linux/mm_types.h> #include <linux/dma-contiguous.h> - -struct cma { - unsigned long base_pfn; - unsigned long count; - unsigned long *bitmap; - struct mutex lock; -}; - -struct cma *dma_contiguous_default_area; +#include <linux/cma.h> #ifdef CONFIG_CMA_SIZE_MBYTES #define CMA_SIZE_MBYTES CONFIG_CMA_SIZE_MBYTES @@ -48,6 +34,8 @@ struct cma *dma_contiguous_default_area; #define CMA_SIZE_MBYTES 0 #endif +struct cma *dma_contiguous_default_area; + /* * Default global CMA area size can be defined in kernel's .config. * This is useful mainly for distro maintainers to create a kernel @@ -154,65 +142,6 @@ void __init dma_contiguous_reserve(phys_addr_t limit) } } -static DEFINE_MUTEX(cma_mutex); - -static int __init cma_activate_area(struct cma *cma) -{ - int bitmap_size = BITS_TO_LONGS(cma->count) * sizeof(long); - unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; - unsigned i = cma->count >> pageblock_order; - struct zone *zone; - - cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - - if (!cma->bitmap) - return -ENOMEM; - - WARN_ON_ONCE(!pfn_valid(pfn)); - zone = page_zone(pfn_to_page(pfn)); - - do { - unsigned j; - base_pfn = pfn; - for (j = pageblock_nr_pages; j; --j, pfn++) { - WARN_ON_ONCE(!pfn_valid(pfn)); - /* - * alloc_contig_range requires the pfn range - * specified to be in the same zone. Make this - * simple by forcing the entire CMA resv range - * to be in the same zone. - */ - if (page_zone(pfn_to_page(pfn)) != zone) - goto err; - } - init_cma_reserved_pageblock(pfn_to_page(base_pfn)); - } while (--i); - - mutex_init(&cma->lock); - return 0; - -err: - kfree(cma->bitmap); - return -EINVAL; -} - -static struct cma cma_areas[MAX_CMA_AREAS]; -static unsigned cma_area_count; - -static int __init cma_init_reserved_areas(void) -{ - int i; - - for (i = 0; i < cma_area_count; i++) { - int ret = cma_activate_area(&cma_areas[i]); - if (ret) - return ret; - } - - return 0; -} -core_initcall(cma_init_reserved_areas); - /** * dma_contiguous_reserve_area() - reserve custom contiguous area * @size: Size of the reserved area (in bytes), @@ -234,72 +163,17 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, phys_addr_t limit, struct cma **res_cma, bool fixed) { - struct cma *cma = &cma_areas[cma_area_count]; - phys_addr_t alignment; - int ret = 0; - - pr_debug("%s(size %lx, base %08lx, limit %08lx)\n", __func__, - (unsigned long)size, (unsigned long)base, - (unsigned long)limit); - - /* Sanity checks */ - if (cma_area_count == ARRAY_SIZE(cma_areas)) { - pr_err("Not enough slots for CMA reserved regions!\n"); - return -ENOSPC; - } - - if (!size) - return -EINVAL; - - /* Sanitise input arguments */ - alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); - base = ALIGN(base, alignment); - size = ALIGN(size, alignment); - limit &= ~(alignment - 1); - - /* Reserve memory */ - if (base && fixed) { - if (memblock_is_region_reserved(base, size) || - memblock_reserve(base, size) < 0) { - ret = -EBUSY; - goto err; - } - } else { - phys_addr_t addr = memblock_alloc_range(size, alignment, base, - limit); - if (!addr) { - ret = -ENOMEM; - goto err; - } else { - base = addr; - } - } - - /* - * Each reserved area must be initialised later, when more kernel - * subsystems (like slab allocator) are available. - */ - cma->base_pfn = PFN_DOWN(base); - cma->count = size >> PAGE_SHIFT; - *res_cma = cma; - cma_area_count++; + int ret; - pr_info("CMA: reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, - (unsigned long)base); + ret = cma_declare_contiguous(base, size, limit, 0, 0, fixed, res_cma); + if (ret) + return ret; /* Architecture specific contiguous memory fixup. */ - dma_contiguous_early_fixup(base, size); - return 0; -err: - pr_err("CMA: failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); - return ret; -} + dma_contiguous_early_fixup(cma_get_base(*res_cma), + cma_get_size(*res_cma)); -static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) -{ - mutex_lock(&cma->lock); - bitmap_clear(cma->bitmap, pfn - cma->base_pfn, count); - mutex_unlock(&cma->lock); + return 0; } /** @@ -316,62 +190,10 @@ static void clear_cma_bitmap(struct cma *cma, unsigned long pfn, int count) struct page *dma_alloc_from_contiguous(struct device *dev, int count, unsigned int align) { - unsigned long mask, pfn, pageno, start = 0; - struct cma *cma = dev_get_cma_area(dev); - struct page *page = NULL; - int ret; - - if (!cma || !cma->count) - return NULL; - if (align > CONFIG_CMA_ALIGNMENT) align = CONFIG_CMA_ALIGNMENT; - pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, - count, align); - - if (!count) - return NULL; - - mask = (1 << align) - 1; - - - for (;;) { - mutex_lock(&cma->lock); - pageno = bitmap_find_next_zero_area(cma->bitmap, cma->count, - start, count, mask); - if (pageno >= cma->count) { - mutex_unlock(&cma->lock); - break; - } - bitmap_set(cma->bitmap, pageno, count); - /* - * It's safe to drop the lock here. We've marked this region for - * our exclusive use. If the migration fails we will take the - * lock again and unmark it. - */ - mutex_unlock(&cma->lock); - - pfn = cma->base_pfn + pageno; - mutex_lock(&cma_mutex); - ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); - mutex_unlock(&cma_mutex); - if (ret == 0) { - page = pfn_to_page(pfn); - break; - } else if (ret != -EBUSY) { - clear_cma_bitmap(cma, pfn, count); - break; - } - clear_cma_bitmap(cma, pfn, count); - pr_debug("%s(): memory range at %p is busy, retrying\n", - __func__, pfn_to_page(pfn)); - /* try again with a bit different memory target */ - start = pageno + mask + 1; - } - - pr_debug("%s(): returned %p\n", __func__, page); - return page; + return cma_alloc(dev_get_cma_area(dev), count, align); } /** @@ -387,23 +209,5 @@ struct page *dma_alloc_from_contiguous(struct device *dev, int count, bool dma_release_from_contiguous(struct device *dev, struct page *pages, int count) { - struct cma *cma = dev_get_cma_area(dev); - unsigned long pfn; - - if (!cma || !pages) - return false; - - pr_debug("%s(page %p)\n", __func__, (void *)pages); - - pfn = page_to_pfn(pages); - - if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) - return false; - - VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); - - free_contig_range(pfn, count); - clear_cma_bitmap(cma, pfn, count); - - return true; + return cma_release(dev_get_cma_area(dev), pages, count); } diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 89f752dd8465..a2e13e250bba 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -284,7 +284,7 @@ static int memory_subsys_online(struct device *dev) * attribute and need to set the online_type. */ if (mem->online_type < 0) - mem->online_type = ONLINE_KEEP; + mem->online_type = MMOP_ONLINE_KEEP; ret = memory_block_change_state(mem, MEM_ONLINE, MEM_OFFLINE); @@ -315,23 +315,23 @@ store_mem_state(struct device *dev, if (ret) return ret; - if (!strncmp(buf, "online_kernel", min_t(int, count, 13))) - online_type = ONLINE_KERNEL; - else if (!strncmp(buf, "online_movable", min_t(int, count, 14))) - online_type = ONLINE_MOVABLE; - else if (!strncmp(buf, "online", min_t(int, count, 6))) - online_type = ONLINE_KEEP; - else if (!strncmp(buf, "offline", min_t(int, count, 7))) - online_type = -1; + if (sysfs_streq(buf, "online_kernel")) + online_type = MMOP_ONLINE_KERNEL; + else if (sysfs_streq(buf, "online_movable")) + online_type = MMOP_ONLINE_MOVABLE; + else if (sysfs_streq(buf, "online")) + online_type = MMOP_ONLINE_KEEP; + else if (sysfs_streq(buf, "offline")) + online_type = MMOP_OFFLINE; else { ret = -EINVAL; goto err; } switch (online_type) { - case ONLINE_KERNEL: - case ONLINE_MOVABLE: - case ONLINE_KEEP: + case MMOP_ONLINE_KERNEL: + case MMOP_ONLINE_MOVABLE: + case MMOP_ONLINE_KEEP: /* * mem->online_type is not protected so there can be a * race here. However, when racing online, the first @@ -342,7 +342,7 @@ store_mem_state(struct device *dev, mem->online_type = online_type; ret = device_online(&mem->dev); break; - case -1: + case MMOP_OFFLINE: ret = device_offline(&mem->dev); break; default: @@ -406,7 +406,9 @@ memory_probe_store(struct device *dev, struct device_attribute *attr, int i, ret; unsigned long pages_per_block = PAGES_PER_SECTION * sections_per_block; - phys_addr = simple_strtoull(buf, NULL, 0); + ret = kstrtoull(buf, 0, &phys_addr); + if (ret) + return ret; if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) return -EINVAL; diff --git a/drivers/base/node.c b/drivers/base/node.c index 8f7ed9933a7c..c6d3ae05f1ca 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -126,7 +126,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(node_page_state(nid, NR_FILE_PAGES)), nid, K(node_page_state(nid, NR_FILE_MAPPED)), nid, K(node_page_state(nid, NR_ANON_PAGES)), - nid, K(node_page_state(nid, NR_SHMEM)), + nid, K(i.sharedram), nid, node_page_state(nid, NR_KERNEL_STACK) * THREAD_SIZE / 1024, nid, K(node_page_state(nid, NR_PAGETABLE)), diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 089e72cd37be..dfa4024c448a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -183,19 +183,32 @@ static ssize_t comp_algorithm_store(struct device *dev, static int zram_test_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - return meta->table[index].flags & BIT(flag); + return meta->table[index].value & BIT(flag); } static void zram_set_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags |= BIT(flag); + meta->table[index].value |= BIT(flag); } static void zram_clear_flag(struct zram_meta *meta, u32 index, enum zram_pageflags flag) { - meta->table[index].flags &= ~BIT(flag); + meta->table[index].value &= ~BIT(flag); +} + +static size_t zram_get_obj_size(struct zram_meta *meta, u32 index) +{ + return meta->table[index].value & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static void zram_set_obj_size(struct zram_meta *meta, + u32 index, size_t size) +{ + unsigned long flags = meta->table[index].value >> ZRAM_FLAG_SHIFT; + + meta->table[index].value = (flags << ZRAM_FLAG_SHIFT) | size; } static inline int is_partial_io(struct bio_vec *bvec) @@ -255,7 +268,6 @@ static struct zram_meta *zram_meta_alloc(u64 disksize) goto free_table; } - rwlock_init(&meta->tb_lock); return meta; free_table: @@ -304,7 +316,12 @@ static void handle_zero_page(struct bio_vec *bvec) flush_dcache_page(page); } -/* NOTE: caller should hold meta->tb_lock with write-side */ + +/* + * To protect concurrent access to the same index entry, + * caller should hold this table index entry's bit_spinlock to + * indicate this index entry is accessing. + */ static void zram_free_page(struct zram *zram, size_t index) { struct zram_meta *meta = zram->meta; @@ -324,11 +341,12 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(meta->mem_pool, handle); - atomic64_sub(meta->table[index].size, &zram->stats.compr_data_size); + atomic64_sub(zram_get_obj_size(meta, index), + &zram->stats.compr_data_size); atomic64_dec(&zram->stats.pages_stored); meta->table[index].handle = 0; - meta->table[index].size = 0; + zram_set_obj_size(meta, index, 0); } static int zram_decompress_page(struct zram *zram, char *mem, u32 index) @@ -337,14 +355,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - u16 size; + size_t size; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; - size = meta->table[index].size; + size = zram_get_obj_size(meta, index); if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); clear_page(mem); return 0; } @@ -355,7 +373,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) else ret = zcomp_decompress(zram->comp, cmem, size, mem); zs_unmap_object(meta->mem_pool, handle); - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Should NEVER happen. Return bio error if it does. */ if (unlikely(ret)) { @@ -376,14 +394,14 @@ static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, struct zram_meta *meta = zram->meta; page = bvec->bv_page; - read_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); if (unlikely(!meta->table[index].handle) || zram_test_flag(meta, index, ZRAM_ZERO)) { - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); handle_zero_page(bvec); return 0; } - read_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); if (is_partial_io(bvec)) /* Use a temporary buffer to decompress the page */ @@ -461,10 +479,10 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, if (page_zero_filled(uncmem)) { kunmap_atomic(user_mem); /* Free memory associated with this sector now. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); zram_set_flag(meta, index, ZRAM_ZERO); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.zero_pages); ret = 0; @@ -514,12 +532,12 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, * Free memory associated with this sector * before overwriting unused sectors. */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); meta->table[index].handle = handle; - meta->table[index].size = clen; - write_unlock(&zram->meta->tb_lock); + zram_set_obj_size(meta, index, clen); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); /* Update stats */ atomic64_add(clen, &zram->stats.compr_data_size); @@ -560,6 +578,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, int offset, struct bio *bio) { size_t n = bio->bi_iter.bi_size; + struct zram_meta *meta = zram->meta; /* * zram manages data in physical block size units. Because logical block @@ -580,13 +599,9 @@ static void zram_bio_discard(struct zram *zram, u32 index, } while (n >= PAGE_SIZE) { - /* - * Discard request can be large so the lock hold times could be - * lengthy. So take the lock once per page. - */ - write_lock(&zram->meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&zram->meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); index++; n -= PAGE_SIZE; } @@ -622,11 +637,18 @@ static void zram_reset_device(struct zram *zram, bool reset_capacity) memset(&zram->stats, 0, sizeof(zram->stats)); zram->disksize = 0; - if (reset_capacity) { + if (reset_capacity) set_capacity(zram->disk, 0); - revalidate_disk(zram->disk); - } + up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + if (reset_capacity) + revalidate_disk(zram->disk); } static ssize_t disksize_store(struct device *dev, @@ -666,8 +688,15 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - revalidate_disk(zram->disk); up_write(&zram->init_lock); + + /* + * Revalidate disk out of the init_lock to avoid lockdep splat. + * It's okay because disk's capacity is protected by init_lock + * so that revalidate_disk always sees up-to-date capacity. + */ + revalidate_disk(zram->disk); + return len; out_destroy_comp: @@ -807,9 +836,9 @@ static void zram_slot_free_notify(struct block_device *bdev, zram = bdev->bd_disk->private_data; meta = zram->meta; - write_lock(&meta->tb_lock); + bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); zram_free_page(zram, index); - write_unlock(&meta->tb_lock); + bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); atomic64_inc(&zram->stats.notify_free); } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 7f21c145e317..5b0afde729cd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,7 +43,6 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /*-- End of configurable params */ #define SECTOR_SHIFT 9 -#define SECTOR_SIZE (1 << SECTOR_SHIFT) #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -51,10 +50,24 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; #define ZRAM_SECTOR_PER_LOGICAL_BLOCK \ (1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT)) -/* Flags for zram pages (table[page_no].flags) */ + +/* + * The lower ZRAM_FLAG_SHIFT bits of table.value is for + * object size (excluding header), the higher bits is for + * zram_pageflags. + * + * zram is mainly used for memory efficiency so we want to keep memory + * footprint small so we can squeeze size and flags into a field. + * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), + * the higher bits is for zram_pageflags. + */ +#define ZRAM_FLAG_SHIFT 24 + +/* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO, + ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, + ZRAM_ACCESS, /* page in now accessed */ __NR_ZRAM_PAGEFLAGS, }; @@ -62,11 +75,10 @@ enum zram_pageflags { /*-- Data structures */ /* Allocated for each disk page */ -struct table { +struct zram_table_entry { unsigned long handle; - u16 size; /* object size (excluding header) */ - u8 flags; -} __aligned(4); + unsigned long value; +}; struct zram_stats { atomic64_t compr_data_size; /* compressed size of pages stored */ @@ -81,8 +93,7 @@ struct zram_stats { }; struct zram_meta { - rwlock_t tb_lock; /* protect table */ - struct table *table; + struct zram_table_entry *table; struct zs_pool *mem_pool; }; diff --git a/drivers/firmware/efi/runtime-map.c b/drivers/firmware/efi/runtime-map.c index 97cdd16a2169..018c29a26615 100644 --- a/drivers/firmware/efi/runtime-map.c +++ b/drivers/firmware/efi/runtime-map.c @@ -138,6 +138,27 @@ add_sysfs_runtime_map_entry(struct kobject *kobj, int nr) return entry; } +int efi_get_runtime_map_size(void) +{ + return nr_efi_runtime_map * efi_memdesc_size; +} + +int efi_get_runtime_map_desc_size(void) +{ + return efi_memdesc_size; +} + +int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + size_t sz = efi_get_runtime_map_size(); + + if (sz > bufsz) + sz = bufsz; + + memcpy(buf, efi_runtime_map, sz); + return 0; +} + void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) { efi_runtime_map = map; diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index 17cf96c45f2b..79f18e6d9c4f 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -286,7 +286,11 @@ int __meminit firmware_map_add_hotplug(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = firmware_map_find_entry_bootmem(start, end, type); + entry = firmware_map_find_entry(start, end - 1, type); + if (entry) + return 0; + + entry = firmware_map_find_entry_bootmem(start, end - 1, type); if (!entry) { entry = kzalloc(sizeof(struct firmware_map_entry), GFP_ATOMIC); if (!entry) diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index 7e4bae760e27..c3b80fd65d62 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -125,7 +125,7 @@ int drm_ht_insert_item(struct drm_open_hash *ht, struct drm_hash_item *item) parent = &entry->head; } if (parent) { - hlist_add_after_rcu(parent, &item->head); + hlist_add_behind_rcu(&item->head, parent); } else { hlist_add_head_rcu(&item->head, h_list); } diff --git a/drivers/hwmon/asus_atk0110.c b/drivers/hwmon/asus_atk0110.c index ae208f612198..cccef87963e0 100644 --- a/drivers/hwmon/asus_atk0110.c +++ b/drivers/hwmon/asus_atk0110.c @@ -688,7 +688,7 @@ static int atk_debugfs_gitm_get(void *p, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(atk_debugfs_gitm, atk_debugfs_gitm_get, NULL, - "0x%08llx\n") + "0x%08llx\n"); static int atk_acpi_print(char *buf, size_t sz, union acpi_object *obj) { diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig index a11ff74a5127..9eac8de9e8b7 100644 --- a/drivers/input/Kconfig +++ b/drivers/input/Kconfig @@ -178,6 +178,15 @@ comment "Input Device Drivers" source "drivers/input/keyboard/Kconfig" +config INPUT_LEDS + bool "LED Support" + depends on LEDS_CLASS = INPUT || LEDS_CLASS = y + select LEDS_TRIGGERS + default y + help + This option enables support for LEDs on keyboards managed + by the input layer. + source "drivers/input/mouse/Kconfig" source "drivers/input/joystick/Kconfig" diff --git a/drivers/input/Makefile b/drivers/input/Makefile index 5ca3f631497f..2ab5f3336da5 100644 --- a/drivers/input/Makefile +++ b/drivers/input/Makefile @@ -6,6 +6,9 @@ obj-$(CONFIG_INPUT) += input-core.o input-core-y := input.o input-compat.o input-mt.o ff-core.o +ifeq ($(CONFIG_INPUT_LEDS),y) +input-core-y += leds.o +endif obj-$(CONFIG_INPUT_FF_MEMLESS) += ff-memless.o obj-$(CONFIG_INPUT_POLLDEV) += input-polldev.o diff --git a/drivers/input/input.c b/drivers/input/input.c index 1c4c0db05550..3b9284b18e70 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -708,6 +708,9 @@ static void input_disconnect_device(struct input_dev *dev) handle->open = 0; spin_unlock_irq(&dev->event_lock); + + if (is_event_supported(EV_LED, dev->evbit, EV_MAX)) + input_led_disconnect(dev); } /** @@ -2134,6 +2137,9 @@ int input_register_device(struct input_dev *dev) list_add_tail(&dev->node, &input_dev_list); + if (is_event_supported(EV_LED, dev->evbit, EV_MAX)) + input_led_connect(dev); + list_for_each_entry(handler, &input_handler_list, node) input_attach_handler(dev, handler); diff --git a/drivers/input/leds.c b/drivers/input/leds.c new file mode 100644 index 000000000000..985fa7ebeec7 --- /dev/null +++ b/drivers/input/leds.c @@ -0,0 +1,249 @@ +/* + * LED support for the input layer + * + * Copyright 2010-2014 Samuel Thibault <samuel.thibault@ens-lyon.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/leds.h> +#include <linux/input.h> + +/* + * Keyboard LEDs are propagated by default like the following example: + * + * VT keyboard numlock trigger + * -> vt::numl VT LED + * -> vt-numl VT trigger + * -> per-device inputX::numl LED + * + * Userland can however choose the trigger for the vt::numl LED, or + * independently choose the trigger for any inputx::numl LED. + * + * + * VT LED classes and triggers are registered on-demand according to + * existing LED devices + */ + +/* Handler for VT LEDs, just triggers the corresponding VT trigger. */ +static void vt_led_set(struct led_classdev *cdev, + enum led_brightness brightness); +static struct led_classdev vt_leds[LED_CNT] = { +#define DEFINE_INPUT_LED(vt_led, nam, deftrig) \ + [vt_led] = { \ + .name = "vt::"nam, \ + .max_brightness = 1, \ + .brightness_set = vt_led_set, \ + .default_trigger = deftrig, \ + } +/* Default triggers for the VT LEDs just correspond to the legacy + * usage. */ + DEFINE_INPUT_LED(LED_NUML, "numl", "kbd-numlock"), + DEFINE_INPUT_LED(LED_CAPSL, "capsl", "kbd-capslock"), + DEFINE_INPUT_LED(LED_SCROLLL, "scrolll", "kbd-scrollock"), + DEFINE_INPUT_LED(LED_COMPOSE, "compose", NULL), + DEFINE_INPUT_LED(LED_KANA, "kana", "kbd-kanalock"), + DEFINE_INPUT_LED(LED_SLEEP, "sleep", NULL), + DEFINE_INPUT_LED(LED_SUSPEND, "suspend", NULL), + DEFINE_INPUT_LED(LED_MUTE, "mute", NULL), + DEFINE_INPUT_LED(LED_MISC, "misc", NULL), + DEFINE_INPUT_LED(LED_MAIL, "mail", NULL), + DEFINE_INPUT_LED(LED_CHARGING, "charging", NULL), +}; +static const char *const vt_led_names[LED_CNT] = { + [LED_NUML] = "numl", + [LED_CAPSL] = "capsl", + [LED_SCROLLL] = "scrolll", + [LED_COMPOSE] = "compose", + [LED_KANA] = "kana", + [LED_SLEEP] = "sleep", + [LED_SUSPEND] = "suspend", + [LED_MUTE] = "mute", + [LED_MISC] = "misc", + [LED_MAIL] = "mail", + [LED_CHARGING] = "charging", +}; +/* Handler for hotplug initialization */ +static void vt_led_trigger_activate(struct led_classdev *cdev); +/* VT triggers */ +static struct led_trigger vt_led_triggers[LED_CNT] = { +#define DEFINE_INPUT_LED_TRIGGER(vt_led, nam) \ + [vt_led] = { \ + .name = "vt-"nam, \ + .activate = vt_led_trigger_activate, \ + } + DEFINE_INPUT_LED_TRIGGER(LED_NUML, "numl"), + DEFINE_INPUT_LED_TRIGGER(LED_CAPSL, "capsl"), + DEFINE_INPUT_LED_TRIGGER(LED_SCROLLL, "scrolll"), + DEFINE_INPUT_LED_TRIGGER(LED_COMPOSE, "compose"), + DEFINE_INPUT_LED_TRIGGER(LED_KANA, "kana"), + DEFINE_INPUT_LED_TRIGGER(LED_SLEEP, "sleep"), + DEFINE_INPUT_LED_TRIGGER(LED_SUSPEND, "suspend"), + DEFINE_INPUT_LED_TRIGGER(LED_MUTE, "mute"), + DEFINE_INPUT_LED_TRIGGER(LED_MISC, "misc"), + DEFINE_INPUT_LED_TRIGGER(LED_MAIL, "mail"), + DEFINE_INPUT_LED_TRIGGER(LED_CHARGING, "charging"), +}; + +/* Lock for registration coherency */ +static DEFINE_MUTEX(vt_led_registered_lock); + +/* Which VT LED classes and triggers are registered */ +static unsigned long vt_led_registered[BITS_TO_LONGS(LED_CNT)]; + +/* Number of input devices having each LED */ +static int vt_led_references[LED_CNT]; + +/* VT LED state change, tell the VT trigger. */ +static void vt_led_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + int led = cdev - vt_leds; + + led_trigger_event(&vt_led_triggers[led], !!brightness); +} + +/* LED state change for some keyboard, notify that keyboard. */ +static void perdevice_input_led_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + struct input_dev *dev; + struct led_classdev *leds; + int led; + + dev = cdev->dev->platform_data; + if (!dev) + /* Still initializing */ + return; + leds = dev->leds; + led = cdev - leds; + + input_event(dev, EV_LED, led, !!brightness); + input_event(dev, EV_SYN, SYN_REPORT, 0); +} + +/* Keyboard hotplug, initialize its LED status */ +static void vt_led_trigger_activate(struct led_classdev *cdev) +{ + struct led_trigger *trigger = cdev->trigger; + int led = trigger - vt_led_triggers; + + if (cdev->brightness_set) + cdev->brightness_set(cdev, vt_leds[led].brightness); +} + +/* Free led stuff from input device, used at abortion and disconnection. */ +static void input_led_delete(struct input_dev *dev) +{ + if (dev) { + struct led_classdev *leds = dev->leds; + if (leds) { + int i; + for (i = 0; i < LED_CNT; i++) + kfree(leds[i].name); + kfree(leds); + dev->leds = NULL; + } + } +} + +/* A new input device with potential LEDs to connect. */ +int input_led_connect(struct input_dev *dev) +{ + int i, error = 0; + struct led_classdev *leds; + + dev->leds = leds = kcalloc(LED_CNT, sizeof(*leds), GFP_KERNEL); + if (!dev->leds) + return -ENOMEM; + + /* lazily register missing VT LEDs */ + mutex_lock(&vt_led_registered_lock); + for (i = 0; i < LED_CNT; i++) + if (vt_leds[i].name && test_bit(i, dev->ledbit)) { + if (!vt_led_references[i]) { + led_trigger_register(&vt_led_triggers[i]); + /* This keyboard is first to have led i, + * try to register it */ + if (!led_classdev_register(NULL, &vt_leds[i])) + vt_led_references[i] = 1; + else + led_trigger_unregister(&vt_led_triggers[i]); + } else + vt_led_references[i]++; + } + mutex_unlock(&vt_led_registered_lock); + + /* and register this device's LEDs */ + for (i = 0; i < LED_CNT; i++) + if (vt_leds[i].name && test_bit(i, dev->ledbit)) { + leds[i].name = kasprintf(GFP_KERNEL, "%s::%s", + dev_name(&dev->dev), + vt_led_names[i]); + if (!leds[i].name) { + error = -ENOMEM; + goto err; + } + leds[i].max_brightness = 1; + leds[i].brightness_set = perdevice_input_led_set; + leds[i].default_trigger = vt_led_triggers[i].name; + } + + /* No issue so far, we can register for real. */ + for (i = 0; i < LED_CNT; i++) + if (leds[i].name) { + led_classdev_register(&dev->dev, &leds[i]); + leds[i].dev->platform_data = dev; + perdevice_input_led_set(&leds[i], + vt_leds[i].brightness); + } + + return 0; + +err: + input_led_delete(dev); + return error; +} + +/* + * Disconnected input device. Clean it, and deregister now-useless VT LEDs + * and triggers. + */ +void input_led_disconnect(struct input_dev *dev) +{ + int i; + struct led_classdev *leds = dev->leds; + + for (i = 0; i < LED_CNT; i++) + if (leds[i].name) + led_classdev_unregister(&leds[i]); + + input_led_delete(dev); + + mutex_lock(&vt_led_registered_lock); + for (i = 0; i < LED_CNT; i++) { + if (!vt_leds[i].name || !test_bit(i, dev->ledbit)) + continue; + + vt_led_references[i]--; + if (vt_led_references[i]) { + /* Still some devices needing it */ + continue; + } + + led_classdev_unregister(&vt_leds[i]); + led_trigger_unregister(&vt_led_triggers[i]); + clear_bit(i, vt_led_registered); + } + mutex_unlock(&vt_led_registered_lock); +} + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("User LED support for input layer"); +MODULE_AUTHOR("Samuel Thibault <samuel.thibault@ens-lyon.org>"); diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 8736f69262f9..27cf0cdcdaab 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -11,9 +11,6 @@ menuconfig NEW_LEDS Say Y to enable Linux LED support. This allows control of supported LEDs from both userspace and optionally, by kernel events (triggers). - This is not related to standard keyboard LEDs which are controlled - via the input system. - if NEW_LEDS config LEDS_CLASS diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index 0bf1e4edf04d..6590558d1d31 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -42,7 +42,6 @@ DEFINE_MUTEX(lguest_lock); static __init int map_switcher(void) { int i, err; - struct page **pagep; /* * Map the Switcher in to high memory. @@ -110,11 +109,9 @@ static __init int map_switcher(void) * This code actually sets up the pages we've allocated to appear at * switcher_addr. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our - * array of struct pages. It increments that pointer, but we don't - * care. + * array of struct pages. */ - pagep = lg_switcher_pages; - err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); + err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages); if (err) { printk("lguest: map_vm_area failed: %i\n", err); goto free_vma; diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c index 1972d57aadb3..e7fbc08a0627 100644 --- a/drivers/misc/ti-st/st_core.c +++ b/drivers/misc/ti-st/st_core.c @@ -342,7 +342,7 @@ void st_int_recv(void *disc_data, /* Unknow packet? */ default: type = *ptr; - if (st_gdata->list[type] == NULL) { + if (type >= ST_MAX_CHANNELS || st_gdata->list[type] == NULL) { pr_err("chip/interface misbehavior dropping" " frame starting with 0x%02x", type); goto done; diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index 3abd3cbab75f..053e8506fd97 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -1853,7 +1853,7 @@ static int i40e_update_ethtool_fdir_entry(struct i40e_vsi *vsi, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &pf->fdir_filter_list); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c index a452730a3278..a6e5bcc12c8f 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c @@ -2518,7 +2518,7 @@ static int ixgbe_update_ethtool_fdir_entry(struct ixgbe_adapter *adapter, /* add filter to the list */ if (parent) - hlist_add_after(&parent->fdir_node, &input->fdir_node); + hlist_add_behind(&input->fdir_node, &parent->fdir_node); else hlist_add_head(&input->fdir_node, &adapter->fdir_filter_list); diff --git a/drivers/net/irda/donauboe.c b/drivers/net/irda/donauboe.c index 768dfe9a9315..6d3e2093bf7f 100644 --- a/drivers/net/irda/donauboe.c +++ b/drivers/net/irda/donauboe.c @@ -1755,17 +1755,4 @@ static struct pci_driver donauboe_pci_driver = { .resume = toshoboe_wakeup }; -static int __init -donauboe_init (void) -{ - return pci_register_driver(&donauboe_pci_driver); -} - -static void __exit -donauboe_cleanup (void) -{ - pci_unregister_driver(&donauboe_pci_driver); -} - -module_init(donauboe_init); -module_exit(donauboe_cleanup); +module_pci_driver(donauboe_pci_driver); diff --git a/drivers/parport/parport_ip32.c b/drivers/parport/parport_ip32.c index c864f82bd37d..30e981be14c2 100644 --- a/drivers/parport/parport_ip32.c +++ b/drivers/parport/parport_ip32.c @@ -2204,7 +2204,7 @@ static int __init parport_ip32_init(void) { pr_info(PPIP32 "SGI IP32 built-in parallel port driver v0.6\n"); this_port = parport_ip32_probe_port(); - return IS_ERR(this_port) ? PTR_ERR(this_port) : 0; + return PTR_ERR_OR_ZERO(this_port); } /** diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 0754f5c7cb3b..83743a1a6731 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -760,6 +760,15 @@ config RTC_DRV_DS1742 This driver can also be built as a module. If so, the module will be called rtc-ds1742. +config RTC_DRV_DS2404 + tristate "Maxim/Dallas DS2404" + help + If you say yes here you get support for the + Dallas DS2404 RTC chip. + + This driver can also be built as a module. If so, the module + will be called rtc-ds2404. + config RTC_DRV_DA9052 tristate "Dialog DA9052/DA9053 RTC" depends on PMIC_DA9052 @@ -789,7 +798,7 @@ config RTC_DRV_DA9063 config RTC_DRV_EFI tristate "EFI RTC" - depends on IA64 + depends on EFI help If you say yes here you will get support for the EFI Real Time Clock. @@ -873,15 +882,6 @@ config RTC_DRV_V3020 This driver can also be built as a module. If so, the module will be called rtc-v3020. -config RTC_DRV_DS2404 - tristate "Dallas DS2404" - help - If you say yes here you get support for the - Dallas DS2404 RTC chip. - - This driver can also be built as a module. If so, the module - will be called rtc-ds2404. - config RTC_DRV_WM831X tristate "Wolfson Microelectronics WM831x RTC" depends on MFD_WM831X @@ -1349,6 +1349,7 @@ config RTC_DRV_SIRFSOC config RTC_DRV_MOXART tristate "MOXA ART RTC" + depends on ARCH_MOXART || COMPILE_TEST help If you say yes here you get support for the MOXA ART RTC module. diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index 70347d041d10..f1dfc3648ee5 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -10,6 +10,10 @@ obj-$(CONFIG_RTC_SYSTOHC) += systohc.o obj-$(CONFIG_RTC_CLASS) += rtc-core.o rtc-core-y := class.o interface.o +ifdef CONFIG_RTC_DRV_EFI +rtc-core-y += rtc-efi-platform.o +endif + rtc-core-$(CONFIG_RTC_INTF_DEV) += rtc-dev.o rtc-core-$(CONFIG_RTC_INTF_PROC) += rtc-proc.o rtc-core-$(CONFIG_RTC_INTF_SYSFS) += rtc-sysfs.o diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 5813fa52c3d4..5b2717f5dafa 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -348,6 +348,8 @@ static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) /* Make sure we're not setting alarms in the past */ err = __rtc_read_time(rtc, &tm); + if (err) + return err; rtc_tm_to_time(&tm, &now); if (scheduled <= now) return -ETIME; diff --git a/drivers/rtc/rtc-ds1343.c b/drivers/rtc/rtc-ds1343.c index c3719189dd96..ae9f997223b1 100644 --- a/drivers/rtc/rtc-ds1343.c +++ b/drivers/rtc/rtc-ds1343.c @@ -4,6 +4,7 @@ * Real Time Clock * * Author : Raghavendra Chandra Ganiga <ravi23ganiga@gmail.com> + * Ankur Srivastava <sankurece@gmail.com> : DS1343 Nvram Support * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -45,6 +46,9 @@ #define DS1343_CONTROL_REG 0x0F #define DS1343_STATUS_REG 0x10 #define DS1343_TRICKLE_REG 0x11 +#define DS1343_NVRAM 0x20 + +#define DS1343_NVRAM_LEN 96 /* DS1343 Control Registers bits */ #define DS1343_EOSC 0x80 @@ -149,6 +153,64 @@ static ssize_t ds1343_store_glitchfilter(struct device *dev, static DEVICE_ATTR(glitch_filter, S_IRUGO | S_IWUSR, ds1343_show_glitchfilter, ds1343_store_glitchfilter); +static ssize_t ds1343_nvram_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + int ret; + unsigned char address; + struct device *dev = kobj_to_dev(kobj); + struct ds1343_priv *priv = dev_get_drvdata(dev); + + if (unlikely(!count)) + return count; + + if ((count + off) > DS1343_NVRAM_LEN) + count = DS1343_NVRAM_LEN - off; + + address = DS1343_NVRAM + off; + + ret = regmap_bulk_write(priv->map, address, buf, count); + if (ret < 0) + dev_err(&priv->spi->dev, "Error in nvram write %d", ret); + + return (ret < 0) ? ret : count; +} + + +static ssize_t ds1343_nvram_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *attr, + char *buf, loff_t off, size_t count) +{ + int ret; + unsigned char address; + struct device *dev = kobj_to_dev(kobj); + struct ds1343_priv *priv = dev_get_drvdata(dev); + + if (unlikely(!count)) + return count; + + if ((count + off) > DS1343_NVRAM_LEN) + count = DS1343_NVRAM_LEN - off; + + address = DS1343_NVRAM + off; + + ret = regmap_bulk_read(priv->map, address, buf, count); + if (ret < 0) + dev_err(&priv->spi->dev, "Error in nvram read %d\n", ret); + + return (ret < 0) ? ret : count; +} + + +static struct bin_attribute nvram_attr = { + .attr.name = "nvram", + .attr.mode = S_IRUGO | S_IWUSR, + .read = ds1343_nvram_read, + .write = ds1343_nvram_write, + .size = DS1343_NVRAM_LEN, +}; + static ssize_t ds1343_show_alarmstatus(struct device *dev, struct device_attribute *attr, char *buf) { @@ -274,12 +336,16 @@ static int ds1343_sysfs_register(struct device *dev) if (err) goto error1; + err = device_create_bin_file(dev, &nvram_attr); + if (err) + goto error2; + if (priv->irq <= 0) return err; err = device_create_file(dev, &dev_attr_alarm_mode); if (err) - goto error2; + goto error3; err = device_create_file(dev, &dev_attr_alarm_status); if (!err) @@ -287,6 +353,9 @@ static int ds1343_sysfs_register(struct device *dev) device_remove_file(dev, &dev_attr_alarm_mode); +error3: + device_remove_bin_file(dev, &nvram_attr); + error2: device_remove_file(dev, &dev_attr_trickle_charger); @@ -302,6 +371,7 @@ static void ds1343_sysfs_unregister(struct device *dev) device_remove_file(dev, &dev_attr_glitch_filter); device_remove_file(dev, &dev_attr_trickle_charger); + device_remove_bin_file(dev, &nvram_attr); if (priv->irq <= 0) return; @@ -684,6 +754,7 @@ static struct spi_driver ds1343_driver = { module_spi_driver(ds1343_driver); MODULE_DESCRIPTION("DS1343 RTC SPI Driver"); -MODULE_AUTHOR("Raghavendra Chandra Ganiga <ravi23ganiga@gmail.com>"); +MODULE_AUTHOR("Raghavendra Chandra Ganiga <ravi23ganiga@gmail.com>," + "Ankur Srivastava <sankurece@gmail.com>"); MODULE_LICENSE("GPL v2"); MODULE_VERSION(DS1343_DRV_VERSION); diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index c6b2191a4128..9822715db8ba 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -231,7 +231,7 @@ static struct platform_driver ds1742_rtc_driver = { .driver = { .name = "rtc-ds1742", .owner = THIS_MODULE, - .of_match_table = ds1742_rtc_of_match, + .of_match_table = of_match_ptr(ds1742_rtc_of_match), }, }; diff --git a/drivers/rtc/rtc-efi-platform.c b/drivers/rtc/rtc-efi-platform.c new file mode 100644 index 000000000000..1a7f890bff5b --- /dev/null +++ b/drivers/rtc/rtc-efi-platform.c @@ -0,0 +1,30 @@ +/* + * Moved from arch/ia64/kernel/time.c + * + * Copyright (C) 1998-2003 Hewlett-Packard Co + * Stephane Eranian <eranian@hpl.hp.com> + * David Mosberger <davidm@hpl.hp.com> + * Copyright (C) 1999 Don Dugger <don.dugger@intel.com> + * Copyright (C) 1999-2000 VA Linux Systems + * Copyright (C) 1999-2000 Walt Drummond <drummond@valinux.com> + */ +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/efi.h> +#include <linux/platform_device.h> + +static struct platform_device rtc_efi_dev = { + .name = "rtc-efi", + .id = -1, +}; + +static int __init rtc_init(void) +{ + if (platform_device_register(&rtc_efi_dev) < 0) + pr_err("unable to register rtc device...\n"); + + /* not necessarily an error */ + return 0; +} +module_init(rtc_init); diff --git a/drivers/rtc/rtc-efi.c b/drivers/rtc/rtc-efi.c index c4c38431012e..8225b89de810 100644 --- a/drivers/rtc/rtc-efi.c +++ b/drivers/rtc/rtc-efi.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/module.h> +#include <linux/stringify.h> #include <linux/time.h> #include <linux/platform_device.h> #include <linux/rtc.h> @@ -48,8 +49,8 @@ compute_wday(efi_time_t *eft) int y; int ndays = 0; - if (eft->year < 1998) { - pr_err("EFI year < 1998, invalid date\n"); + if (eft->year < EFI_RTC_EPOCH) { + pr_err("EFI year < " __stringify(EFI_RTC_EPOCH) ", invalid date\n"); return -1; } @@ -78,19 +79,36 @@ convert_to_efi_time(struct rtc_time *wtime, efi_time_t *eft) eft->timezone = EFI_UNSPECIFIED_TIMEZONE; } -static void +static bool convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime) { memset(wtime, 0, sizeof(*wtime)); + + if (eft->second >= 60) + return false; wtime->tm_sec = eft->second; + + if (eft->minute >= 60) + return false; wtime->tm_min = eft->minute; + + if (eft->hour >= 24) + return false; wtime->tm_hour = eft->hour; + + if (!eft->day || eft->day > 31) + return false; wtime->tm_mday = eft->day; + + if (!eft->month || eft->month > 12) + return false; wtime->tm_mon = eft->month - 1; wtime->tm_year = eft->year - 1900; /* day of the week [0-6], Sunday=0 */ wtime->tm_wday = compute_wday(eft); + if (wtime->tm_wday < 0) + return false; /* day in the year [1-365]*/ wtime->tm_yday = compute_yday(eft); @@ -106,6 +124,8 @@ convert_from_efi_time(efi_time_t *eft, struct rtc_time *wtime) default: wtime->tm_isdst = -1; } + + return true; } static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) @@ -122,7 +142,8 @@ static int efi_read_alarm(struct device *dev, struct rtc_wkalrm *wkalrm) if (status != EFI_SUCCESS) return -EINVAL; - convert_from_efi_time(&eft, &wkalrm->time); + if (!convert_from_efi_time(&eft, &wkalrm->time)) + return -EIO; return rtc_valid_tm(&wkalrm->time); } @@ -163,7 +184,8 @@ static int efi_read_time(struct device *dev, struct rtc_time *tm) return -EINVAL; } - convert_from_efi_time(&eft, tm); + if (!convert_from_efi_time(&eft, tm)) + return -EIO; return rtc_valid_tm(tm); } diff --git a/drivers/staging/android/binder.c b/drivers/staging/android/binder.c index 02b0379ae550..4f34dc0095b5 100644 --- a/drivers/staging/android/binder.c +++ b/drivers/staging/android/binder.c @@ -585,7 +585,6 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; - struct page **page_array_ptr; page = &proc->pages[(page_addr - proc->buffer) / PAGE_SIZE]; @@ -598,8 +597,7 @@ static int binder_update_page_range(struct binder_proc *proc, int allocate, } tmp_area.addr = page_addr; tmp_area.size = PAGE_SIZE + PAGE_SIZE /* guard page? */; - page_array_ptr = page; - ret = map_vm_area(&tmp_area, PAGE_KERNEL, &page_array_ptr); + ret = map_vm_area(&tmp_area, PAGE_KERNEL, page); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %p in kernel\n", proc->pid, page_addr); diff --git a/drivers/staging/lustre/lustre/libcfs/hash.c b/drivers/staging/lustre/lustre/libcfs/hash.c index 5dde79418297..8ef1deb59d4a 100644 --- a/drivers/staging/lustre/lustre/libcfs/hash.c +++ b/drivers/staging/lustre/lustre/libcfs/hash.c @@ -351,7 +351,7 @@ cfs_hash_dh_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_t, dh_head); if (dh->dh_tail != NULL) /* not empty */ - hlist_add_after(dh->dh_tail, hnode); + hlist_add_behind(hnode, dh->dh_tail); else /* empty list */ hlist_add_head(hnode, &dh->dh_head); dh->dh_tail = hnode; @@ -406,7 +406,7 @@ cfs_hash_dd_hnode_add(struct cfs_hash *hs, struct cfs_hash_bd *bd, cfs_hash_dhead_dep_t, dd_head); if (dh->dd_tail != NULL) /* not empty */ - hlist_add_after(dh->dd_tail, hnode); + hlist_add_behind(hnode, dh->dd_tail); else /* empty list */ hlist_add_head(hnode, &dh->dd_head); dh->dd_tail = hnode; diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig index b24aa010f68c..65cd80bf9aec 100644 --- a/drivers/tty/Kconfig +++ b/drivers/tty/Kconfig @@ -13,6 +13,10 @@ config VT bool "Virtual terminal" if EXPERT depends on !S390 && !UML select INPUT + select NEW_LEDS + select LEDS_CLASS + select LEDS_TRIGGERS + select INPUT_LEDS default y ---help--- If you say Y here, you will get support for terminal devices with diff --git a/drivers/tty/vt/keyboard.c b/drivers/tty/vt/keyboard.c index d0e3a4497707..d6ecfc9e734f 100644 --- a/drivers/tty/vt/keyboard.c +++ b/drivers/tty/vt/keyboard.c @@ -33,6 +33,7 @@ #include <linux/string.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/leds.h> #include <linux/kbd_kern.h> #include <linux/kbd_diacr.h> @@ -130,6 +131,7 @@ static char rep; /* flag telling character repeat */ static int shift_state = 0; static unsigned char ledstate = 0xff; /* undefined */ +static unsigned char lockstate = 0xff; /* undefined */ static unsigned char ledioctl; /* @@ -961,6 +963,41 @@ static void k_brl(struct vc_data *vc, unsigned char value, char up_flag) } } +/* We route VT keyboard "leds" through triggers */ +static void kbd_ledstate_trigger_activate(struct led_classdev *cdev); + +static struct led_trigger ledtrig_ledstate[] = { +#define DEFINE_LEDSTATE_TRIGGER(kbd_led, nam) \ + [kbd_led] = { \ + .name = nam, \ + .activate = kbd_ledstate_trigger_activate, \ + } + DEFINE_LEDSTATE_TRIGGER(VC_SCROLLOCK, "kbd-scrollock"), + DEFINE_LEDSTATE_TRIGGER(VC_NUMLOCK, "kbd-numlock"), + DEFINE_LEDSTATE_TRIGGER(VC_CAPSLOCK, "kbd-capslock"), + DEFINE_LEDSTATE_TRIGGER(VC_KANALOCK, "kbd-kanalock"), +#undef DEFINE_LEDSTATE_TRIGGER +}; + +static void kbd_lockstate_trigger_activate(struct led_classdev *cdev); + +static struct led_trigger ledtrig_lockstate[] = { +#define DEFINE_LOCKSTATE_TRIGGER(kbd_led, nam) \ + [kbd_led] = { \ + .name = nam, \ + .activate = kbd_lockstate_trigger_activate, \ + } + DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLOCK, "kbd-shiftlock"), + DEFINE_LOCKSTATE_TRIGGER(VC_ALTGRLOCK, "kbd-altgrlock"), + DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLOCK, "kbd-ctrllock"), + DEFINE_LOCKSTATE_TRIGGER(VC_ALTLOCK, "kbd-altlock"), + DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTLLOCK, "kbd-shiftllock"), + DEFINE_LOCKSTATE_TRIGGER(VC_SHIFTRLOCK, "kbd-shiftrlock"), + DEFINE_LOCKSTATE_TRIGGER(VC_CTRLLLOCK, "kbd-ctrlllock"), + DEFINE_LOCKSTATE_TRIGGER(VC_CTRLRLOCK, "kbd-ctrlrlock"), +#undef DEFINE_LOCKSTATE_TRIGGER +}; + /* * The leds display either (i) the status of NumLock, CapsLock, ScrollLock, * or (ii) whatever pattern of lights people want to show using KDSETLED, @@ -995,18 +1032,25 @@ static inline unsigned char getleds(void) return kbd->ledflagstate; } -static int kbd_update_leds_helper(struct input_handle *handle, void *data) +/* Called on trigger connection, to set initial state */ +static void kbd_ledstate_trigger_activate(struct led_classdev *cdev) { - unsigned char leds = *(unsigned char *)data; + struct led_trigger *trigger = cdev->trigger; + int led = trigger - ledtrig_ledstate; - if (test_bit(EV_LED, handle->dev->evbit)) { - input_inject_event(handle, EV_LED, LED_SCROLLL, !!(leds & 0x01)); - input_inject_event(handle, EV_LED, LED_NUML, !!(leds & 0x02)); - input_inject_event(handle, EV_LED, LED_CAPSL, !!(leds & 0x04)); - input_inject_event(handle, EV_SYN, SYN_REPORT, 0); - } + tasklet_disable(&keyboard_tasklet); + led_trigger_event(trigger, ledstate & (1 << led) ? LED_FULL : LED_OFF); + tasklet_enable(&keyboard_tasklet); +} - return 0; +static void kbd_lockstate_trigger_activate(struct led_classdev *cdev) +{ + struct led_trigger *trigger = cdev->trigger; + int led = trigger - ledtrig_lockstate; + + tasklet_disable(&keyboard_tasklet); + led_trigger_event(trigger, lockstate & (1 << led) ? LED_FULL : LED_OFF); + tasklet_enable(&keyboard_tasklet); } /** @@ -1095,16 +1139,29 @@ static void kbd_bh(unsigned long dummy) { unsigned char leds; unsigned long flags; - + int i; + spin_lock_irqsave(&led_lock, flags); leds = getleds(); spin_unlock_irqrestore(&led_lock, flags); if (leds != ledstate) { - input_handler_for_each_handle(&kbd_handler, &leds, - kbd_update_leds_helper); + for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++) + if ((leds ^ ledstate) & (1 << i)) + led_trigger_event(&ledtrig_ledstate[i], + leds & (1 << i) + ? LED_FULL : LED_OFF); ledstate = leds; } + + if (kbd->lockstate != lockstate) { + for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++) + if ((kbd->lockstate ^ lockstate) & (1 << i)) + led_trigger_event(&ledtrig_lockstate[i], + kbd->lockstate & (1 << i) + ? LED_FULL : LED_OFF); + lockstate = kbd->lockstate; + } } DECLARE_TASKLET_DISABLED(keyboard_tasklet, kbd_bh, 0); @@ -1442,20 +1499,6 @@ static void kbd_disconnect(struct input_handle *handle) kfree(handle); } -/* - * Start keyboard handler on the new keyboard by refreshing LED state to - * match the rest of the system. - */ -static void kbd_start(struct input_handle *handle) -{ - tasklet_disable(&keyboard_tasklet); - - if (ledstate != 0xff) - kbd_update_leds_helper(handle, &ledstate); - - tasklet_enable(&keyboard_tasklet); -} - static const struct input_device_id kbd_ids[] = { { .flags = INPUT_DEVICE_ID_MATCH_EVBIT, @@ -1477,7 +1520,6 @@ static struct input_handler kbd_handler = { .match = kbd_match, .connect = kbd_connect, .disconnect = kbd_disconnect, - .start = kbd_start, .name = "kbd", .id_table = kbd_ids, }; @@ -1501,6 +1543,20 @@ int __init kbd_init(void) if (error) return error; + for (i = 0; i < ARRAY_SIZE(ledtrig_ledstate); i++) { + error = led_trigger_register(&ledtrig_ledstate[i]); + if (error) + pr_err("error %d while registering trigger %s\n", + error, ledtrig_ledstate[i].name); + } + + for (i = 0; i < ARRAY_SIZE(ledtrig_lockstate); i++) { + error = led_trigger_register(&ledtrig_lockstate[i]); + if (error) + pr_err("error %d while registering trigger %s\n", + error, ledtrig_lockstate[i].name); + } + tasklet_enable(&keyboard_tasklet); tasklet_schedule(&keyboard_tasklet); diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 428089009cd5..19b170d9e570 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -190,8 +190,6 @@ static ssize_t brightness_store(struct device *dev, } mutex_unlock(&bd->ops_lock); - backlight_generate_event(bd, BACKLIGHT_UPDATE_SYSFS); - return rc; } static DEVICE_ATTR_RW(brightness); diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index c770337c4b45..24575d9d882d 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -153,6 +153,7 @@ extern int adfs_map_lookup(struct super_block *sb, unsigned int frag_id, unsigne extern unsigned int adfs_map_free(struct super_block *sb); /* Misc */ +__printf(3, 4) void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...); #define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 0d138c0de293..51c279a29845 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -138,7 +138,7 @@ adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_inf goto out; if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, "parent directory changed under me! (%lx but got %lx)\n", + adfs_error(sb, "parent directory changed under me! (%lx but got %x)\n", ADFS_I(inode)->parent_id, dir.parent_id); ret = -EIO; goto free_out; diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d9e3bee4e653..f2ba88ab4aed 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -55,10 +55,10 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct } size >>= sb->s_blocksize_bits; - if (size > sizeof(dir->bh)/sizeof(dir->bh[0])) { + if (size > ARRAY_SIZE(dir->bh)) { /* this directory is too big for fixed bh set, must allocate */ struct buffer_head **bh_fplus = - kzalloc(size * sizeof(struct buffer_head *), + kcalloc(size, sizeof(struct buffer_head *), GFP_KERNEL); if (!bh_fplus) { adfs_error(sb, "not enough memory for" @@ -79,9 +79,8 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct dir->bh_fplus[blk] = sb_bread(sb, block); if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %X failed read for" - " offset %d, mapped block %X", - id, blk, block); + adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", + id, blk, block); goto out; } diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index f7f87e233dd9..f40006db36df 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -46,6 +46,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); +extern void bfs_dump_imap(const char *, struct super_block *); /* file.c */ extern const struct inode_operations bfs_file_inops; diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index a399e6d9dc74..08063ae0a17c 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -75,8 +75,6 @@ const struct file_operations bfs_dir_operations = { .llseek = generic_file_llseek, }; -extern void dump_imap(const char *, struct super_block *); - static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { @@ -110,7 +108,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, BFS_I(inode)->i_eblock = 0; insert_inode_hash(inode); mark_inode_dirty(inode); - dump_imap("create", s); + bfs_dump_imap("create", s); err = bfs_add_entry(dir, dentry->d_name.name, dentry->d_name.len, inode->i_ino); diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 7041ac35ace8..90bc079d9982 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -30,8 +30,6 @@ MODULE_LICENSE("GPL"); #define dprintf(x...) #endif -void dump_imap(const char *prefix, struct super_block *s); - struct inode *bfs_iget(struct super_block *sb, unsigned long ino) { struct bfs_inode *di; @@ -194,7 +192,7 @@ static void bfs_evict_inode(struct inode *inode) info->si_freeb += bi->i_eblock + 1 - bi->i_sblock; info->si_freei++; clear_bit(ino, info->si_imap); - dump_imap("delete_inode", s); + bfs_dump_imap("delete_inode", s); } /* @@ -297,7 +295,7 @@ static const struct super_operations bfs_sops = { .statfs = bfs_statfs, }; -void dump_imap(const char *prefix, struct super_block *s) +void bfs_dump_imap(const char *prefix, struct super_block *s) { #ifdef DEBUG int i; @@ -443,7 +441,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) } brelse(bh); brelse(sbh); - dump_imap("read_super", s); + bfs_dump_imap("read_super", s); return 0; out3: diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3892c1a23241..d94672a1f272 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -145,6 +145,25 @@ static int padzero(unsigned long elf_bss) #define ELF_BASE_PLATFORM NULL #endif +/* + * Use get_random_int() to implement AT_RANDOM while avoiding depletion + * of the entropy pool. + */ +static void get_atrandom_bytes(unsigned char *buf, size_t nbytes) +{ + unsigned char *p = buf; + + while (nbytes) { + unsigned int random_variable; + size_t chunk = min_t(size_t, nbytes, sizeof(random_variable)); + + random_variable = get_random_int(); + memcpy(p, &random_variable, chunk); + p += chunk; + nbytes -= chunk; + } +} + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -206,7 +225,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, /* * Generate 16 random bytes for userspace PRNG seeding. */ - get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + get_atrandom_bytes(k_rand_bytes, sizeof(k_rand_bytes)); u_rand_bytes = (elf_addr_t __user *) STACK_ALLOC(p, sizeof(k_rand_bytes)); if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index d749731dc0ee..dd333a67ebc2 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -39,13 +39,11 @@ int cachefiles_daemon_bind(struct cachefiles_cache *cache, char *args) args); /* start by checking things over */ - ASSERT(cache->fstop_percent >= 0 && - cache->fstop_percent < cache->fcull_percent && + ASSERT(cache->fstop_percent < cache->fcull_percent && cache->fcull_percent < cache->frun_percent && cache->frun_percent < 100); - ASSERT(cache->bstop_percent >= 0 && - cache->bstop_percent < cache->bcull_percent && + ASSERT(cache->bstop_percent < cache->bcull_percent && cache->bcull_percent < cache->brun_percent && cache->brun_percent < 100); diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index b078d3081d6c..ff43e5a8711f 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -222,7 +222,7 @@ static ssize_t cachefiles_daemon_write(struct file *file, if (test_bit(CACHEFILES_DEAD, &cache->flags)) return -EIO; - if (datalen < 0 || datalen > PAGE_SIZE - 1) + if (datalen > PAGE_SIZE - 1) return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ @@ -385,7 +385,7 @@ static int cachefiles_daemon_fstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (fstop < 0 || fstop >= cache->fcull_percent) + if (fstop >= cache->fcull_percent) return cachefiles_daemon_range_error(cache, args); cache->fstop_percent = fstop; @@ -457,7 +457,7 @@ static int cachefiles_daemon_bstop(struct cachefiles_cache *cache, char *args) if (args[0] != '%' || args[1] != '\0') return -EINVAL; - if (bstop < 0 || bstop >= cache->bcull_percent) + if (bstop >= cache->bcull_percent) return cachefiles_daemon_range_error(cache, args); cache->bstop_percent = bstop; diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 7ff866dbb89e..54ac0e8ad96c 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -38,7 +38,7 @@ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { - 1, 1, {0, 0, 0, 0, 0, 5}, {__constant_cpu_to_le32(11)} }; + 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; /* group users */ static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7d4361f40c23..692d79f2e104 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2477,14 +2477,14 @@ CIFSSMBPosixLock(const unsigned int xid, struct cifs_tcon *tcon, } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; else { if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_RDLCK)) + cpu_to_le16(CIFS_RDLCK)) pLockData->fl_type = F_RDLCK; else if (parm_data->lock_type == - __constant_cpu_to_le16(CIFS_WRLCK)) + cpu_to_le16(CIFS_WRLCK)) pLockData->fl_type = F_WRLCK; pLockData->fl_start = le64_to_cpu(parm_data->start); @@ -3276,25 +3276,25 @@ CIFSSMB_set_compression(const unsigned int xid, struct cifs_tcon *tcon, pSMB->compression_state = cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); pSMB->TotalParameterCount = 0; - pSMB->TotalDataCount = __constant_cpu_to_le32(2); + pSMB->TotalDataCount = cpu_to_le32(2); pSMB->MaxParameterCount = 0; pSMB->MaxDataCount = 0; pSMB->MaxSetupCount = 4; pSMB->Reserved = 0; pSMB->ParameterOffset = 0; - pSMB->DataCount = __constant_cpu_to_le32(2); + pSMB->DataCount = cpu_to_le32(2); pSMB->DataOffset = cpu_to_le32(offsetof(struct smb_com_transaction_compr_ioctl_req, compression_state) - 4); /* 84 */ pSMB->SetupCount = 4; - pSMB->SubCommand = __constant_cpu_to_le16(NT_TRANSACT_IOCTL); + pSMB->SubCommand = cpu_to_le16(NT_TRANSACT_IOCTL); pSMB->ParameterCount = 0; - pSMB->FunctionCode = __constant_cpu_to_le32(FSCTL_SET_COMPRESSION); + pSMB->FunctionCode = cpu_to_le32(FSCTL_SET_COMPRESSION); pSMB->IsFsctl = 1; /* FSCTL */ pSMB->IsRootFlag = 0; pSMB->Fid = fid; /* file handle always le */ /* 3 byte pad, followed by 2 byte compress state */ - pSMB->ByteCount = __constant_cpu_to_le16(5); + pSMB->ByteCount = cpu_to_le16(5); inc_rfc1001_len(pSMB, 5); rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, @@ -3430,10 +3430,10 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, cifs_acl->version = cpu_to_le16(1); if (acl_type == ACL_TYPE_ACCESS) { cifs_acl->access_entry_count = cpu_to_le16(count); - cifs_acl->default_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->default_entry_count = cpu_to_le16(0xFFFF); } else if (acl_type == ACL_TYPE_DEFAULT) { cifs_acl->default_entry_count = cpu_to_le16(count); - cifs_acl->access_entry_count = __constant_cpu_to_le16(0xFFFF); + cifs_acl->access_entry_count = cpu_to_le16(0xFFFF); } else { cifs_dbg(FYI, "unknown ACL type %d\n", acl_type); return 0; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 01a6339a554c..03558d432f7a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1058,7 +1058,7 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; @@ -1393,7 +1393,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, max_num = (max_buf - sizeof(struct smb_hdr)) / sizeof(LOCKING_ANDX_RANGE); - buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); if (!buf) return -ENOMEM; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 39ee32688eac..39b850786e12 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -46,7 +46,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, SESSION_SETUP_ANDX *pSMB) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - pSMB->req.VcNumber = __constant_cpu_to_le16(1); + pSMB->req.VcNumber = cpu_to_le16(1); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 3f17b4550831..e5100b8fb3a0 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -111,7 +111,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, return -EINVAL; max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -247,7 +247,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) } max_num = max_buf / sizeof(struct smb2_lock_element); - buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); + buf = kcalloc(max_num, sizeof(struct smb2_lock_element), GFP_KERNEL); if (!buf) { free_xid(xid); return -ENOMEM; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index b8021fde987d..36867bd21d04 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -67,27 +67,27 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * indexed by command in host byte order */ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { - /* SMB2_NEGOTIATE */ __constant_cpu_to_le16(65), - /* SMB2_SESSION_SETUP */ __constant_cpu_to_le16(9), - /* SMB2_LOGOFF */ __constant_cpu_to_le16(4), - /* SMB2_TREE_CONNECT */ __constant_cpu_to_le16(16), - /* SMB2_TREE_DISCONNECT */ __constant_cpu_to_le16(4), - /* SMB2_CREATE */ __constant_cpu_to_le16(89), - /* SMB2_CLOSE */ __constant_cpu_to_le16(60), - /* SMB2_FLUSH */ __constant_cpu_to_le16(4), - /* SMB2_READ */ __constant_cpu_to_le16(17), - /* SMB2_WRITE */ __constant_cpu_to_le16(17), - /* SMB2_LOCK */ __constant_cpu_to_le16(4), - /* SMB2_IOCTL */ __constant_cpu_to_le16(49), + /* SMB2_NEGOTIATE */ cpu_to_le16(65), + /* SMB2_SESSION_SETUP */ cpu_to_le16(9), + /* SMB2_LOGOFF */ cpu_to_le16(4), + /* SMB2_TREE_CONNECT */ cpu_to_le16(16), + /* SMB2_TREE_DISCONNECT */ cpu_to_le16(4), + /* SMB2_CREATE */ cpu_to_le16(89), + /* SMB2_CLOSE */ cpu_to_le16(60), + /* SMB2_FLUSH */ cpu_to_le16(4), + /* SMB2_READ */ cpu_to_le16(17), + /* SMB2_WRITE */ cpu_to_le16(17), + /* SMB2_LOCK */ cpu_to_le16(4), + /* SMB2_IOCTL */ cpu_to_le16(49), /* BB CHECK this ... not listed in documentation */ - /* SMB2_CANCEL */ __constant_cpu_to_le16(0), - /* SMB2_ECHO */ __constant_cpu_to_le16(4), - /* SMB2_QUERY_DIRECTORY */ __constant_cpu_to_le16(9), - /* SMB2_CHANGE_NOTIFY */ __constant_cpu_to_le16(9), - /* SMB2_QUERY_INFO */ __constant_cpu_to_le16(9), - /* SMB2_SET_INFO */ __constant_cpu_to_le16(2), + /* SMB2_CANCEL */ cpu_to_le16(0), + /* SMB2_ECHO */ cpu_to_le16(4), + /* SMB2_QUERY_DIRECTORY */ cpu_to_le16(9), + /* SMB2_CHANGE_NOTIFY */ cpu_to_le16(9), + /* SMB2_QUERY_INFO */ cpu_to_le16(9), + /* SMB2_SET_INFO */ cpu_to_le16(2), /* BB FIXME can also be 44 for lease break */ - /* SMB2_OPLOCK_BREAK */ __constant_cpu_to_le16(24) + /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 081529f316e3..59437c50472f 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -590,7 +590,7 @@ smb2_clone_range(const unsigned int xid, goto cchunk_out; /* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = __constant_cpu_to_le32(1); + pcchunk->ChunkCount = cpu_to_le32(1); pcchunk->Reserved = 0; pcchunk->Reserved2 = 0; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 768cddb72bdb..2057250c76d3 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1355,7 +1355,7 @@ SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, char *ret_data = NULL; fsctl_input.CompressionState = - __constant_cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); + cpu_to_le16(COMPRESSION_FORMAT_DEFAULT); rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid, FSCTL_SET_COMPRESSION, true /* is_fsctl */, diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 69f3595d3952..d03adad014db 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -85,7 +85,7 @@ /* BB FIXME - analyze following length BB */ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ -#define SMB2_PROTO_NUMBER __constant_cpu_to_le32(0x424d53fe) +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) /* * SMB2 Header Definition @@ -96,7 +96,7 @@ * */ -#define SMB2_HEADER_STRUCTURE_SIZE __constant_cpu_to_le16(64) +#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ @@ -137,16 +137,16 @@ struct smb2_transform_hdr { } __packed; /* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM __constant_cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) /* * SMB2 flag definitions */ -#define SMB2_FLAGS_SERVER_TO_REDIR __constant_cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND __constant_cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS __constant_cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED __constant_cpu_to_le32(0x00000008) -#define SMB2_FLAGS_DFS_OPERATIONS __constant_cpu_to_le32(0x10000000) +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) /* * Definitions for SMB2 Protocol Data Units (network frames) @@ -157,7 +157,7 @@ struct smb2_transform_hdr { * */ -#define SMB2_ERROR_STRUCTURE_SIZE2 __constant_cpu_to_le16(9) +#define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { struct smb2_hdr hdr; @@ -500,12 +500,12 @@ struct create_context { #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 -#define SMB2_LEASE_NONE __constant_cpu_to_le32(0x00) -#define SMB2_LEASE_READ_CACHING __constant_cpu_to_le32(0x01) -#define SMB2_LEASE_HANDLE_CACHING __constant_cpu_to_le32(0x02) -#define SMB2_LEASE_WRITE_CACHING __constant_cpu_to_le32(0x04) +#define SMB2_LEASE_NONE cpu_to_le32(0x00) +#define SMB2_LEASE_READ_CACHING cpu_to_le32(0x01) +#define SMB2_LEASE_HANDLE_CACHING cpu_to_le32(0x02) +#define SMB2_LEASE_WRITE_CACHING cpu_to_le32(0x04) -#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS __constant_cpu_to_le32(0x02) +#define SMB2_LEASE_FLAG_BREAK_IN_PROGRESS cpu_to_le32(0x02) #define SMB2_LEASE_KEY_SIZE 16 diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 1da168c61d35..278f8fdeb9ef 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/list.h> #include <linux/sched.h> diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2849f41e72a2..1326d38960db 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -13,7 +13,7 @@ #include <linux/fs.h> #include <linux/stat.h> #include <linux/errno.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/coda.h> diff --git a/fs/coda/dir.c b/fs/coda/dir.c index cd8a63238b11..9c3dedc000d1 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -19,8 +19,7 @@ #include <linux/string.h> #include <linux/spinlock.h> #include <linux/namei.h> - -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/file.c b/fs/coda/file.c index 9e83b7790212..d244d743a232 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -18,7 +18,7 @@ #include <linux/spinlock.h> #include <linux/string.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/inode.c b/fs/coda/inode.c index fe3afb2de880..b945410bfcd5 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -21,9 +21,7 @@ #include <linux/vfs.h> #include <linux/slab.h> #include <linux/pid_namespace.h> - -#include <asm/uaccess.h> - +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/vmalloc.h> diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 3f5de96bbb58..4326d172fc27 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -16,7 +16,7 @@ #include <linux/string.h> #include <linux/namei.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 5c1e4242368b..822629126e89 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -40,7 +40,7 @@ #include <linux/pid_namespace.h> #include <asm/io.h> #include <asm/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/coda.h> #include <linux/coda_psdev.h> diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 21fcf8dcb9cd..5bb6e27298a4 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -27,7 +27,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/vfs.h> diff --git a/fs/compat.c b/fs/compat.c index 66d3d3c6b4b2..6917fdba382c 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -562,7 +562,7 @@ ssize_t compat_rw_copy_check_uvector(int type, goto out; ret = -EINVAL; - if (nr_segs > UIO_MAXIOV || nr_segs < 0) + if (nr_segs > UIO_MAXIOV) goto out; if (nr_segs > fast_segs) { ret = -ENOMEM; diff --git a/fs/coredump.c b/fs/coredump.c index 0b2528fb640e..a93f7e6ea4cf 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -306,7 +306,7 @@ static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, if (unlikely(nr < 0)) return nr; - tsk->flags = PF_DUMPCORE; + tsk->flags |= PF_DUMPCORE; if (atomic_read(&mm->mm_users) == nr + 1) goto done; /* diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index ddcfe590b8a8..355c522f3585 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -11,6 +11,8 @@ * The actual compression is based on zlib, see the other files. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -21,7 +23,7 @@ #include <linux/vfs.h> #include <linux/mutex.h> #include <uapi/linux/cramfs_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -153,7 +155,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, static unsigned char read_buffers[READ_BUFFERS][BUFFER_SIZE]; static unsigned buffer_blocknr[READ_BUFFERS]; -static struct super_block * buffer_dev[READ_BUFFERS]; +static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* @@ -205,6 +207,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { wait_on_page_locked(page); if (!PageUptodate(page)) { @@ -223,6 +226,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i data = read_buffers[buffer]; for (i = 0; i < BLKS_PER_BUF; i++) { struct page *page = pages[i]; + if (page) { memcpy(data, kmap(page), PAGE_CACHE_SIZE); kunmap(page); @@ -237,6 +241,7 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + kill_block_super(sb); kfree(sbi); } @@ -277,7 +282,7 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check for wrong endianness */ if (super.magic == CRAMFS_MAGIC_WEND) { if (!silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); return -EINVAL; } @@ -287,22 +292,22 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&read_mutex); if (super.magic != CRAMFS_MAGIC) { if (super.magic == CRAMFS_MAGIC_WEND && !silent) - printk(KERN_ERR "cramfs: wrong endianness\n"); + pr_err("wrong endianness\n"); else if (!silent) - printk(KERN_ERR "cramfs: wrong magic\n"); + pr_err("wrong magic\n"); return -EINVAL; } } /* get feature flags first */ if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { - printk(KERN_ERR "cramfs: unsupported filesystem features\n"); + pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ if (!S_ISDIR(super.root.mode)) { - printk(KERN_ERR "cramfs: root is not a directory\n"); + pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ @@ -310,23 +315,23 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) root_offset = super.root.offset << 2; if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size=super.size; - sbi->blocks=super.fsid.blocks; - sbi->files=super.fsid.files; + sbi->size = super.size; + sbi->blocks = super.fsid.blocks; + sbi->files = super.fsid.files; } else { - sbi->size=1<<28; - sbi->blocks=0; - sbi->files=0; + sbi->size = 1<<28; + sbi->blocks = 0; + sbi->files = 0; } - sbi->magic=super.magic; - sbi->flags=super.flags; + sbi->magic = super.magic; + sbi->flags = super.flags; if (root_offset == 0) - printk(KERN_INFO "cramfs: empty filesystem"); + pr_info("empty filesystem"); else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { - printk(KERN_ERR "cramfs: bad root offset %lu\n", root_offset); + pr_err("bad root offset %lu\n", root_offset); return -EINVAL; } @@ -425,7 +430,7 @@ static int cramfs_readdir(struct file *file, struct dir_context *ctx) /* * Lookup and fill in the inode data.. */ -static struct dentry * cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +static struct dentry *cramfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { unsigned int offset = 0; struct inode *inode = NULL; @@ -483,7 +488,7 @@ out: return NULL; } -static int cramfs_readpage(struct file *file, struct page * page) +static int cramfs_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; u32 maxblock; @@ -511,7 +516,7 @@ static int cramfs_readpage(struct file *file, struct page * page) if (compr_len == 0) ; /* hole */ else if (unlikely(compr_len > (PAGE_CACHE_SIZE << 1))) { - pr_err("cramfs: bad compressed blocksize %u\n", + pr_err("bad compressed blocksize %u\n", compr_len); goto err; } else { diff --git a/fs/cramfs/uncompress.c b/fs/cramfs/uncompress.c index 1760c1b84d97..ec4f1d4fdad0 100644 --- a/fs/cramfs/uncompress.c +++ b/fs/cramfs/uncompress.c @@ -15,6 +15,8 @@ * then is used by multiple filesystems. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/errno.h> #include <linux/vmalloc.h> @@ -37,7 +39,7 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) err = zlib_inflateReset(&stream); if (err != Z_OK) { - printk("zlib_inflateReset error %d\n", err); + pr_err("zlib_inflateReset error %d\n", err); zlib_inflateEnd(&stream); zlib_inflateInit(&stream); } @@ -48,8 +50,8 @@ int cramfs_uncompress_block(void *dst, int dstlen, void *src, int srclen) return stream.total_out; err: - printk("Error %d while decompressing!\n", err); - printk("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); + pr_err("Error %d while decompressing!\n", err); + pr_err("%p(%d)->%p(%d)\n", src, srclen, dst, dstlen); return -EIO; } @@ -57,7 +59,7 @@ int cramfs_uncompress_init(void) { if (!initialized++) { stream.workspace = vmalloc(zlib_inflate_workspacesize()); - if ( !stream.workspace ) { + if (!stream.workspace) { initialized = 0; return -ENOMEM; } diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8d77ba7b1756..1323c568e362 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -718,16 +718,11 @@ static const struct file_operations waiters_fops = { void dlm_delete_debug_file(struct dlm_ls *ls) { - if (ls->ls_debug_rsb_dentry) - debugfs_remove(ls->ls_debug_rsb_dentry); - if (ls->ls_debug_waiters_dentry) - debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_locks_dentry) - debugfs_remove(ls->ls_debug_locks_dentry); - if (ls->ls_debug_all_dentry) - debugfs_remove(ls->ls_debug_all_dentry); - if (ls->ls_debug_toss_dentry) - debugfs_remove(ls->ls_debug_toss_dentry); + debugfs_remove(ls->ls_debug_rsb_dentry); + debugfs_remove(ls->ls_debug_waiters_dentry); + debugfs_remove(ls->ls_debug_locks_dentry); + debugfs_remove(ls->ls_debug_all_dentry); + debugfs_remove(ls->ls_debug_toss_dentry); } int dlm_create_debug_file(struct dlm_ls *ls) diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 356c044e2cd3..bbee8f063dfa 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -12,7 +12,8 @@ #include "efs.h" -static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) { +static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) +{ struct buffer_head *bh; int slot, namelen; @@ -40,10 +41,10 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { pr_err("%s(): invalid directory block\n", __func__); brelse(bh); - return(0); + return 0; } - for(slot = 0; slot < dirblock->slots; slot++) { + for (slot = 0; slot < dirblock->slots; slot++) { dirslot = (struct efs_dentry *) (((char *) bh->b_data) + EFS_SLOTAT(dirblock, slot)); namelen = dirslot->namelen; @@ -52,12 +53,12 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) if ((namelen == len) && (!memcmp(name, nameptr, len))) { inodenum = be32_to_cpu(dirslot->inode); brelse(bh); - return(inodenum); + return inodenum; } } brelse(bh); } - return(0); + return 0; } struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) diff --git a/fs/exec.c b/fs/exec.c index a3d33fe592d6..2ef2751f5a8d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -368,10 +368,6 @@ static int bprm_mm_init(struct linux_binprm *bprm) if (!mm) goto err; - err = init_new_context(current, mm); - if (err) - goto err; - err = __bprm_mm_init(bprm); if (err) goto err; diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 7f20f25c232c..84529b8a331b 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -116,7 +116,7 @@ static int _sp2d_alloc(unsigned pages_in_unit, unsigned group_width, num_a1pa = min_t(unsigned, PAGE_SIZE / sizeof__a1pa, pages_in_unit - i); - __a1pa = kzalloc(num_a1pa * sizeof__a1pa, GFP_KERNEL); + __a1pa = kcalloc(num_a1pa, sizeof__a1pa, GFP_KERNEL); if (unlikely(!__a1pa)) { ORE_DBGMSG("!! Failed to _alloc_1p_arrays=%d\n", num_a1pa); diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a8bc47f75fa0..5b6e9f246233 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -107,7 +107,10 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } if (!journal) { - ret = generic_file_fsync(file, start, end, datasync); + if (test_opt(inode->i_sb, BARRIER)) + ret = generic_file_fsync(file, start, end, datasync); + else + ret = __generic_file_fsync(file, start, end, datasync); if (!ret && !hlist_empty(&inode->i_dentry)) ret = ext4_sync_parent(inode); goto out; diff --git a/fs/fscache/main.c b/fs/fscache/main.c index a31b83c5cbd9..b39d487ccfb0 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -67,7 +67,7 @@ static int fscache_max_active_sysctl(struct ctl_table *table, int write, return ret; } -struct ctl_table fscache_sysctls[] = { +static struct ctl_table fscache_sysctls[] = { { .procname = "object_max_active", .data = &fscache_object_max_active, @@ -87,7 +87,7 @@ struct ctl_table fscache_sysctls[] = { {} }; -struct ctl_table fscache_sysctls_root[] = { +static struct ctl_table fscache_sysctls_root[] = { { .procname = "fscache", .mode = 0555, diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 32602c667b4a..7892e6fddb66 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -38,21 +38,30 @@ int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, return hfsplus_strcmp(&k1->cat.name, &k2->cat.name); } -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, - u32 parent, struct qstr *str) +/* Generates key for catalog file/folders record. */ +int hfsplus_cat_build_key(struct super_block *sb, + hfsplus_btree_key *key, u32 parent, struct qstr *str) { - int len; + int len, err; key->cat.parent = cpu_to_be32(parent); - if (str) { - hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, - str->name, str->len); - len = be16_to_cpu(key->cat.name.length); - } else { - key->cat.name.length = 0; - len = 0; - } + err = hfsplus_asc2uni(sb, &key->cat.name, HFSPLUS_MAX_STRLEN, + str->name, str->len); + if (unlikely(err < 0)) + return err; + + len = be16_to_cpu(key->cat.name.length); key->key_len = cpu_to_be16(6 + 2 * len); + return 0; +} + +/* Generates key for catalog thread record. */ +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent) +{ + key->cat.parent = cpu_to_be32(parent); + key->cat.name.length = 0; + key->key_len = cpu_to_be16(6); } static void hfsplus_cat_build_key_uni(hfsplus_btree_key *key, u32 parent, @@ -167,11 +176,16 @@ static int hfsplus_fill_cat_thread(struct super_block *sb, hfsplus_cat_entry *entry, int type, u32 parentid, struct qstr *str) { + int err; + entry->type = cpu_to_be16(type); entry->thread.reserved = 0; entry->thread.parentID = cpu_to_be32(parentid); - hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, + err = hfsplus_asc2uni(sb, &entry->thread.nodeName, HFSPLUS_MAX_STRLEN, str->name, str->len); + if (unlikely(err < 0)) + return err; + return 10 + be16_to_cpu(entry->thread.nodeName.length) * 2; } @@ -183,7 +197,7 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, int err; u16 type; - hfsplus_cat_build_key(sb, fd->search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd->search_key, cnid); err = hfs_brec_read(fd, &tmp, sizeof(hfsplus_cat_entry)); if (err) return err; @@ -250,11 +264,16 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) return err; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, S_ISDIR(inode->i_mode) ? HFSPLUS_FOLDER_THREAD : HFSPLUS_FILE_THREAD, dir->i_ino, str); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto err2; + } + err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -265,7 +284,10 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, if (err) goto err2; - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto err1; + entry_size = hfsplus_cat_build_record(&entry, cnid, inode); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err != -ENOENT) { @@ -288,7 +310,7 @@ int hfsplus_create_cat(u32 cnid, struct inode *dir, return 0; err1: - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); if (!hfs_brec_find(&fd, hfs_find_rec_by_key)) hfs_brec_remove(&fd); err2: @@ -313,7 +335,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (!str) { int len; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -329,7 +351,9 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) off + 2, len); fd.search_key->key_len = cpu_to_be16(6 + len); } else - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); + if (unlikely(err)) + goto out; err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) @@ -360,7 +384,7 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) if (err) goto out; - hfsplus_cat_build_key(sb, fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, cnid); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; @@ -405,7 +429,11 @@ int hfsplus_rename_cat(u32 cnid, dst_fd = src_fd; /* find the old dir entry and read the data */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -419,7 +447,11 @@ int hfsplus_rename_cat(u32 cnid, type = be16_to_cpu(entry.type); /* create new dir entry with the data from the old entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, dst_dir->i_ino, dst_name); + err = hfsplus_cat_build_key(sb, dst_fd.search_key, + dst_dir->i_ino, dst_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) @@ -436,7 +468,11 @@ int hfsplus_rename_cat(u32 cnid, dst_dir->i_mtime = dst_dir->i_ctime = CURRENT_TIME_SEC; /* finally remove the old entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, src_dir->i_ino, src_name); + err = hfsplus_cat_build_key(sb, src_fd.search_key, + src_dir->i_ino, src_name); + if (unlikely(err)) + goto out; + err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -449,7 +485,7 @@ int hfsplus_rename_cat(u32 cnid, src_dir->i_mtime = src_dir->i_ctime = CURRENT_TIME_SEC; /* remove old thread entry */ - hfsplus_cat_build_key(sb, src_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, src_fd.search_key, cnid); err = hfs_brec_find(&src_fd, hfs_find_rec_by_key); if (err) goto out; @@ -459,9 +495,14 @@ int hfsplus_rename_cat(u32 cnid, goto out; /* create new thread entry */ - hfsplus_cat_build_key(sb, dst_fd.search_key, cnid, NULL); + hfsplus_cat_build_key_with_cnid(sb, dst_fd.search_key, cnid); entry_size = hfsplus_fill_cat_thread(sb, &entry, type, dst_dir->i_ino, dst_name); + if (unlikely(entry_size < 0)) { + err = entry_size; + goto out; + } + err = hfs_brec_find(&dst_fd, hfs_find_rec_by_key); if (err != -ENOENT) { if (!err) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index 610a3260bef1..435bea231cc6 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -44,7 +44,10 @@ static struct dentry *hfsplus_lookup(struct inode *dir, struct dentry *dentry, err = hfs_find_init(HFSPLUS_SB(sb)->cat_tree, &fd); if (err) return ERR_PTR(err); - hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, &dentry->d_name); + err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, + &dentry->d_name); + if (unlikely(err < 0)) + goto fail; again: err = hfs_brec_read(&fd, &entry, sizeof(entry)); if (err) { @@ -97,9 +100,11 @@ again: be32_to_cpu(entry.file.permissions.dev); str.len = sprintf(name, "iNode%d", linkid); str.name = name; - hfsplus_cat_build_key(sb, fd.search_key, + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_SB(sb)->hidden_dir->i_ino, &str); + if (unlikely(err < 0)) + goto fail; goto again; } } else if (!dentry->d_fsdata) @@ -145,7 +150,7 @@ static int hfsplus_readdir(struct file *file, struct dir_context *ctx) err = -ENOMEM; goto out; } - hfsplus_cat_build_key(sb, fd.search_key, inode->i_ino, NULL); + hfsplus_cat_build_key_with_cnid(sb, fd.search_key, inode->i_ino); err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) goto out; diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index eb5e059f481a..b0441d65fa54 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -443,8 +443,10 @@ int hfsplus_cat_case_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); int hfsplus_cat_bin_cmp_key(const hfsplus_btree_key *k1, const hfsplus_btree_key *k2); -void hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, +int hfsplus_cat_build_key(struct super_block *sb, hfsplus_btree_key *key, u32 parent, struct qstr *str); +void hfsplus_cat_build_key_with_cnid(struct super_block *sb, + hfsplus_btree_key *key, u32 parent); void hfsplus_cat_set_perms(struct inode *inode, struct hfsplus_perm *perms); int hfsplus_find_cat(struct super_block *sb, u32 cnid, struct hfs_find_data *fd); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 4cf2024b87da..593af2fdcc2d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -515,7 +515,9 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) err = hfs_find_init(sbi->cat_tree, &fd); if (err) goto out_put_root; - hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + err = hfsplus_cat_build_key(sb, fd.search_key, HFSPLUS_ROOT_CNID, &str); + if (unlikely(err < 0)) + goto out_put_root; if (!hfs_brec_read(&fd, &entry, sizeof(entry))) { hfs_find_exit(&fd); if (entry.type != cpu_to_be16(HFSPLUS_FOLDER)) diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c index f36fc010fccb..2923a7bd82ac 100644 --- a/fs/hpfs/dnode.c +++ b/fs/hpfs/dnode.c @@ -545,12 +545,13 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno) struct dnode *d1; struct quad_buffer_head qbh1; if (hpfs_sb(i->i_sb)->sb_chk) - if (up != i->i_ino) { - hpfs_error(i->i_sb, - "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", - dno, up, (unsigned long)i->i_ino); - return; - } + if (up != i->i_ino) { + hpfs_error(i->i_sb, + "bad pointer to fnode, dnode %08x, pointing to %08x, should be %08lx", + dno, up, + (unsigned long)i->i_ino); + return; + } if ((d1 = hpfs_map_dnode(i->i_sb, down, &qbh1))) { d1->up = cpu_to_le32(up); d1->root_dnode = 1; @@ -1061,8 +1062,8 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno, hpfs_brelse4(qbh); if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, dno, &c1, &c2, "map_fnode_dirent #1")) { - kfree(name2); - return NULL; + kfree(name2); + return NULL; } goto go_down; } diff --git a/fs/isofs/Makefile b/fs/isofs/Makefile index bf162f0942d5..47a68e357512 100644 --- a/fs/isofs/Makefile +++ b/fs/isofs/Makefile @@ -8,3 +8,5 @@ isofs-objs-y := namei.o inode.o dir.o util.o rock.o export.o isofs-objs-$(CONFIG_JOLIET) += joliet.o isofs-objs-$(CONFIG_ZISOFS) += compress.o isofs-objs := $(isofs-objs-y) + +# ccflags-y := -DDEBUG_FLAGS=1 diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 592e5115a561..c9ba688dc407 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -15,6 +15,8 @@ * * Transparent decompression of files on an iso9660 filesystem */ +#define DEBUG +#define pr_fmt(fmt) "zisofs: " fmt #include <linux/module.h> #include <linux/init.h> @@ -110,7 +112,7 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, *errp = -ENOMEM; else *errp = -EIO; - printk(KERN_DEBUG "zisofs: zisofs_inflateInit returned %d\n", + pr_debug("zisofs_inflateInit returned %d\n", zerr); goto z_eio; } @@ -154,12 +156,12 @@ static loff_t zisofs_uncompress_block(struct inode *inode, loff_t block_start, if (zerr == Z_MEM_ERROR) *errp = -ENOMEM; else { - printk(KERN_DEBUG + pr_debug( "zisofs: zisofs_inflate returned" " %d, inode = %lu," " page idx = %d, bh idx = %d," - " avail_in = %d," - " avail_out = %d\n", + " avail_in = %ld," + " avail_out = %ld\n", zerr, inode->i_ino, curpage, curbh, stream.avail_in, stream.avail_out); diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 12088d8de3fa..44d1053dfad5 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -12,7 +12,7 @@ * Documentation/filesystems/nfs/Exporting * fs/exportfs/expfs.c. */ - +#define pr_fmt(fmt) "ISOFS: " fmt #include "isofs.h" static struct dentry * @@ -52,8 +52,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) /* "child" must always be a directory. */ if (!S_ISDIR(child_inode->i_mode)) { - printk(KERN_ERR "isofs: isofs_export_get_parent(): " - "child is not a directory!\n"); + pr_err("%s(): child is not a directory!\n", __func__); rv = ERR_PTR(-EACCES); goto out; } @@ -62,8 +61,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) * it is not zero, it means the directory failed to be * normalized for some reason. */ if (e_child_inode->i_iget5_offset != 0) { - printk(KERN_ERR "isofs: isofs_export_get_parent(): " - "child directory not normalized!\n"); + pr_err("isofs_export_get_parent(): child directory not normalized!\n"); rv = ERR_PTR(-EACCES); goto out; } @@ -89,8 +87,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) /* Verify it is in fact the ".." entry. */ if ((isonum_711(de->name_len) != 1) || (de->name[0] != 1)) { - printk(KERN_ERR "isofs: Unable to find the \"..\" " - "directory for NFS.\n"); + pr_err("Unable to find the \"..\" directory for NFS.\n"); rv = ERR_PTR(-EACCES); goto out; } diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 4556ce1af5b0..cc23d86e174a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -10,6 +10,8 @@ * 2004 Paul Serice - Inode Support pushed out from 4GB to 128GB * 2004 Paul Serice - NFS Export Operations */ +#define DEBUG +#define pr_fmt(fmt) "ISOFS: " fmt #include <linux/init.h> #include <linux/module.h> @@ -528,23 +530,25 @@ static unsigned int isofs_get_last_session(struct super_block *sb, s32 session) Te.cdte_format=CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMREADTOCENTRY, (unsigned long) &Te); if (!i) { - printk(KERN_DEBUG "ISOFS: Session %d start %d type %d\n", + pr_debug("Session %d start %d type %d\n", session, Te.cdte_addr.lba, Te.cdte_ctrl&CDROM_DATA_TRACK); if ((Te.cdte_ctrl&CDROM_DATA_TRACK) == 4) return Te.cdte_addr.lba; } - printk(KERN_ERR "ISOFS: Invalid session number or type of track\n"); + pr_err("Invalid session number or type of track\n"); } i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long) &ms_info); if (session > 0) - printk(KERN_ERR "ISOFS: Invalid session number\n"); + pr_err("Invalid session number\n"); #if 0 - printk(KERN_DEBUG "isofs.inode: CDROMMULTISESSION: rc=%d\n",i); + pr_debug("isofs.inode: CDROMMULTISESSION: rc=%d\n", i); if (i==0) { - printk(KERN_DEBUG "isofs.inode: XA disk: %s\n",ms_info.xa_flag?"yes":"no"); - printk(KERN_DEBUG "isofs.inode: vol_desc_start = %d\n", ms_info.addr.lba); + pr_debug("isofs.inode: XA disk: %s\n", + ms_info.xa_flag?"yes":"no"); + pr_debug("isofs.inode: vol_desc_start = %d\n", + ms_info.addr.lba); } #endif if (i==0) @@ -672,8 +676,7 @@ static int isofs_fill_super(struct super_block *s, void *data, int silent) else if (sec->escape[2] == 0x45) joliet_level = 3; - printk(KERN_DEBUG "ISO 9660 Extensions: " - "Microsoft Joliet Level %d\n", + pr_debug("ISO 9660 Extensions: Microsoft Joliet Level %d\n", joliet_level); } goto root_found; @@ -771,11 +774,11 @@ root_found: isonum_711(rootp->ext_attr_length); sbi->s_firstdatazone = first_data_zone; #ifndef BEQUIET - printk(KERN_DEBUG "ISOFS: Max size:%ld Log zone size:%ld\n", + pr_debug("Max size:%ld Log zone size:%ld\n", sbi->s_max_size, 1UL << sbi->s_log_zone_size); - printk(KERN_DEBUG "ISOFS: First datazone:%ld\n", sbi->s_firstdatazone); + pr_debug("First datazone:%ld\n", sbi->s_firstdatazone); if(sbi->s_high_sierra) - printk(KERN_DEBUG "ISOFS: Disc in High Sierra format.\n"); + pr_debug("Disc in High Sierra format.\n"); #endif /* @@ -878,9 +881,7 @@ root_found: */ if (sbi->s_rock == 1 && joliet_level && rootdir_empty(s, sbi->s_firstdatazone)) { - printk(KERN_NOTICE - "ISOFS: primary root directory is empty. " - "Disabling Rock Ridge and switching to Joliet."); + pr_notice("primary root directory is empty. Disabling Rock Ridge and switching to Joliet."); sbi->s_rock = 0; } @@ -898,8 +899,7 @@ root_found: sbi->s_rock = 0; if (sbi->s_firstdatazone != first_data_zone) { sbi->s_firstdatazone = first_data_zone; - printk(KERN_DEBUG - "ISOFS: changing to secondary root\n"); + pr_debug("changing to secondary root\n"); iput(inode); inode = isofs_iget(s, sbi->s_firstdatazone, 0); if (IS_ERR(inode)) @@ -918,9 +918,8 @@ root_found: /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { - printk(KERN_WARNING - "isofs_fill_super: root inode is not a directory. " - "Corrupted media?\n"); + pr_warn("%s: root inode is not a directory. Corrupted media?\n", + __func__); goto out_iput; } @@ -952,27 +951,26 @@ out_iput: out_no_root: error = PTR_ERR(inode); if (error != -ENOMEM) - printk(KERN_WARNING "%s: get root inode failed\n", __func__); + pr_warn("%s: get root inode failed\n", __func__); out_no_inode: #ifdef CONFIG_JOLIET unload_nls(sbi->s_nls_iocharset); #endif goto out_freesbi; out_no_read: - printk(KERN_WARNING "%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", + pr_warn("%s: bread failed, dev=%s, iso_blknum=%d, block=%d\n", __func__, s->s_id, iso_blknum, block); goto out_freebh; out_bad_zone_size: - printk(KERN_WARNING "ISOFS: Bad logical zone size %ld\n", - sbi->s_log_zone_size); + pr_warn("Bad logical zone size %ld\n", sbi->s_log_zone_size); goto out_freebh; out_bad_size: - printk(KERN_WARNING "ISOFS: Logical zone size(%d) < hardware blocksize(%u)\n", + pr_warn("Logical zone size(%d) < hardware blocksize(%u)\n", orig_zonesize, opt.blocksize); goto out_freebh; out_unknown_format: if (!silent) - printk(KERN_WARNING "ISOFS: Unable to identify CD-ROM format.\n"); + pr_warn("Unable to identify CD-ROM format.\n"); out_freebh: brelse(bh); @@ -1021,7 +1019,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock, error = -EIO; rv = 0; if (iblock != b_off) { - printk(KERN_DEBUG "%s: block number too large\n", __func__); + pr_debug("%s: block number too large\n", __func__); goto abort; } @@ -1042,7 +1040,7 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock, * I/O errors. */ if (b_off > ((inode->i_size + PAGE_CACHE_SIZE - 1) >> ISOFS_BUFFER_BITS(inode))) { - printk(KERN_DEBUG "%s: block >= EOF (%lu, %llu)\n", + pr_debug("%s: block >= EOF (%lu, %llu)\n", __func__, b_off, (unsigned long long)inode->i_size); goto abort; @@ -1068,12 +1066,11 @@ int isofs_get_blocks(struct inode *inode, sector_t iblock, iput(ninode); if (++section > 100) { - printk(KERN_DEBUG "%s: More than 100 file sections ?!?" - " aborting...\n", __func__); - printk(KERN_DEBUG "%s: block=%lu firstext=%u sect_size=%u " - "nextblk=%lu nextoff=%lu\n", __func__, - b_off, firstext, (unsigned) sect_size, - nextblk, nextoff); + pr_debug("%s: More than 100 file sections ?!? aborting...\n", + __func__); + pr_debug("%s: block=%lu firstext=%u sect_size=%u nextblk=%lu nextoff=%lu\n", + __func__, b_off, firstext, + (unsigned) sect_size, nextblk, nextoff); goto abort; } } @@ -1105,7 +1102,7 @@ static int isofs_get_block(struct inode *inode, sector_t iblock, int ret; if (create) { - printk(KERN_DEBUG "%s: Kernel tries to allocate a block\n", __func__); + pr_debug("%s: Kernel tries to allocate a block\n", __func__); return -EROFS; } @@ -1248,13 +1245,12 @@ out_nomem: return -ENOMEM; out_noread: - printk(KERN_INFO "ISOFS: unable to read i-node block %lu\n", block); + pr_info("unable to read i-node block %lu\n", block); kfree(tmpde); return -EIO; out_toomany: - printk(KERN_INFO "%s: More than 100 file sections ?!?, aborting...\n" - "isofs_read_level3_size: inode=%lu\n", + pr_info("%s: More than 100 file sections ?!?, aborting...\n isofs_read_level3_size: inode=%lu\n", __func__, inode->i_ino); goto out; } @@ -1289,7 +1285,7 @@ static int isofs_read_inode(struct inode *inode) tmpde = kmalloc(de_len, GFP_KERNEL); if (tmpde == NULL) { - printk(KERN_INFO "%s: out of memory\n", __func__); + pr_info("%s: out of memory\n", __func__); ret = -ENOMEM; goto fail; } @@ -1364,24 +1360,23 @@ static int isofs_read_inode(struct inode *inode) inode->i_size &= 0x00ffffff; if (de->interleave[0]) { - printk(KERN_DEBUG "ISOFS: Interleaved files not (yet) supported.\n"); + pr_debug("Interleaved files not (yet) supported.\n"); inode->i_size = 0; } /* I have no idea what file_unit_size is used for, so we will flag it for now */ if (de->file_unit_size[0] != 0) { - printk(KERN_DEBUG "ISOFS: File unit size != 0 for ISO file (%ld).\n", - inode->i_ino); + pr_debug("File unit size != 0 for ISO file (%ld).\n", + inode->i_ino); } /* I have no idea what other flag bits are used for, so we will flag it for now */ -#ifdef DEBUG +#ifdef DEBUG_FLAGS if((de->flags[-high_sierra] & ~2)!= 0){ - printk(KERN_DEBUG "ISOFS: Unusual flag settings for ISO file " - "(%ld %x).\n", - inode->i_ino, de->flags[-high_sierra]); + pr_debug("Unusual flag settings for ISO file (%ld %x).\n", + inode->i_ino, de->flags[-high_sierra]); } #endif @@ -1450,7 +1445,7 @@ out: return ret; out_badread: - printk(KERN_WARNING "ISOFS: unable to read i-node block\n"); + pr_warn("unable to read i-node block\n"); fail: goto out; } @@ -1541,6 +1536,7 @@ MODULE_ALIAS("iso9660"); static int __init init_iso9660_fs(void) { int err = init_inodecache(); + if (err) goto out; #ifdef CONFIG_ZISOFS diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 95295640d9c8..c5ed09733112 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -113,9 +113,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, dpnt = de->name; /* Basic sanity check, whether name doesn't exceed dir entry */ if (de_len < dlen + sizeof(struct iso_directory_record)) { - printk(KERN_NOTICE "iso9660: Corrupted directory entry" - " in block %lu of inode %lu\n", block, - dir->i_ino); + pr_notice("Corrupted directory entry in block %lu of inode %lu\n", + block, dir->i_ino); return 0; } diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index c0bf42472e40..b13119556e5d 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -5,6 +5,8 @@ * * Rock Ridge Extensions to iso9660 */ +#define DEBUG +#define pr_fmt(fmt) "ISOFS: rock: " fmt #include <linux/slab.h> #include <linux/pagemap.h> @@ -89,9 +91,8 @@ static int rock_continue(struct rock_state *rs) if ((unsigned)rs->cont_offset > blocksize - min_de_size || (unsigned)rs->cont_size > blocksize || (unsigned)(rs->cont_offset + rs->cont_size) > blocksize) { - printk(KERN_NOTICE "rock: corrupted directory entry. " - "extent=%d, offset=%d, size=%d\n", - rs->cont_extent, rs->cont_offset, rs->cont_size); + pr_notice("corrupted directory entry. extent=%d, offset=%d, size=%d\n", + rs->cont_extent, rs->cont_offset, rs->cont_size); ret = -EIO; goto out; } @@ -117,7 +118,7 @@ static int rock_continue(struct rock_state *rs) rs->cont_offset = 0; return 0; } - printk("Unable to read rock-ridge attributes\n"); + pr_warn("Unable to read rock-ridge attributes\n"); } out: kfree(rs->buffer); @@ -176,10 +177,9 @@ static int rock_check_overflow(struct rock_state *rs, int sig) } len += offsetof(struct rock_ridge, u); if (len > rs->len) { - printk(KERN_NOTICE "rock: directory entry would overflow " - "storage\n"); - printk(KERN_NOTICE "rock: sig=0x%02x, size=%d, remaining=%d\n", - sig, len, rs->len); + pr_notice("directory entry would overflow storage\n"); + pr_notice("sig=0x%02x, size=%d, remaining=%d\n", + sig, len, rs->len); return -EIO; } return 0; @@ -257,7 +257,7 @@ repeat: break; if (rr->u.NM.flags & ~1) { - printk("Unsupported NM flag settings (%d)\n", + pr_warn("Unsupported NM flag settings (%d)\n", rr->u.NM.flags); break; } @@ -353,13 +353,13 @@ repeat: break; case SIG('E', 'R'): ISOFS_SB(inode->i_sb)->s_rock = 1; - printk(KERN_DEBUG "ISO 9660 Extensions: "); + pr_debug("ISO 9660 Extensions: "); { int p; for (p = 0; p < rr->u.ER.len_id; p++) - printk("%c", rr->u.ER.data[p]); + pr_warn("%c", rr->u.ER.data[p]); } - printk("\n"); + pr_warn("\n"); break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); @@ -450,8 +450,7 @@ repeat: inode->i_size += 1; break; default: - printk("Symlink component flag " - "not implemented\n"); + pr_warn("Symlink component flag not implemented\n"); } slen -= slp->len + 2; oldslp = slp; @@ -481,8 +480,7 @@ repeat: symlink_len = inode->i_size; break; case SIG('R', 'E'): - printk(KERN_WARNING "Attempt to read inode for " - "relocated directory\n"); + pr_warn("Attempt to read inode for relocated directory\n"); goto out; case SIG('C', 'L'): ISOFS_I(inode)->i_first_extent = @@ -518,9 +516,7 @@ repeat: int block_shift = isonum_711(&rr->u.ZF.parms[1]); if (block_shift > 17) { - printk(KERN_WARNING "isofs: " - "Can't handle ZF block " - "size of 2^%d\n", + pr_warn("Can't handle ZF block size of 2^%d\n", block_shift); } else { /* @@ -543,9 +539,7 @@ repeat: real_size); } } else { - printk(KERN_WARNING - "isofs: Unknown ZF compression " - "algorithm: %c%c\n", + pr_warn("Unknown ZF compression algorithm: %c%c\n", rr->u.ZF.algorithm[0], rr->u.ZF.algorithm[1]); } @@ -604,7 +598,7 @@ static char *get_symlink_chunk(char *rpnt, struct rock_ridge *rr, char *plimit) *rpnt++ = '/'; break; default: - printk("Symlink component flag not implemented (%d)\n", + pr_warn("Symlink component flag not implemented (%d)\n", slp->flags); } slen -= slp->len + 2; @@ -757,10 +751,10 @@ out: kfree(rs.buffer); goto fail; out_noread: - printk("unable to read i-node block"); + pr_warn("unable to read i-node block"); goto fail; out_bad_span: - printk("symlink spans iso9660 blocks\n"); + pr_warn("symlink spans iso9660 blocks\n"); fail: brelse(bh); error: diff --git a/fs/jffs2/compr_zlib.c b/fs/jffs2/compr_zlib.c index 0b9a1e44e833..5698dae5d92d 100644 --- a/fs/jffs2/compr_zlib.c +++ b/fs/jffs2/compr_zlib.c @@ -94,11 +94,12 @@ static int jffs2_zlib_compress(unsigned char *data_in, while (def_strm.total_out < *dstlen - STREAM_END_SPACE && def_strm.total_in < *sourcelen) { def_strm.avail_out = *dstlen - (def_strm.total_out + STREAM_END_SPACE); - def_strm.avail_in = min((unsigned)(*sourcelen-def_strm.total_in), def_strm.avail_out); - jffs2_dbg(1, "calling deflate with avail_in %d, avail_out %d\n", + def_strm.avail_in = min_t(unsigned long, + (*sourcelen-def_strm.total_in), def_strm.avail_out); + jffs2_dbg(1, "calling deflate with avail_in %ld, avail_out %ld\n", def_strm.avail_in, def_strm.avail_out); ret = zlib_deflate(&def_strm, Z_PARTIAL_FLUSH); - jffs2_dbg(1, "deflate returned with avail_in %d, avail_out %d, total_in %ld, total_out %ld\n", + jffs2_dbg(1, "deflate returned with avail_in %ld, avail_out %ld, total_in %ld, total_out %ld\n", def_strm.avail_in, def_strm.avail_out, def_strm.total_in, def_strm.total_out); if (ret != Z_OK) { diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 48140315f627..380d86e1ab45 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1019,11 +1019,11 @@ static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) /** * logfs_is_valid_block - check whether this block is still valid * - * @sb - superblock - * @ofs - block physical offset - * @ino - block inode number - * @bix - block index - * @level - block level + * @sb: superblock + * @ofs: block physical offset + * @ino: block inode number + * @bix: block index + * @gc_level: block level * * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will * become invalid once the journal is written. @@ -2226,10 +2226,9 @@ void btree_write_block(struct logfs_block *block) * * @inode: parent inode (ifile or directory) * @buf: object to write (inode or dentry) - * @n: object size - * @_pos: object number (file position in blocks/objects) + * @count: object size + * @bix: block index * @flags: write flags - * @lock: 0 if write lock is already taken, 1 otherwise * @shadow_tree: shadow below this inode * * FIXME: All caller of this put a 200-300 byte variable on the stack, diff --git a/fs/mpage.c b/fs/mpage.c index 5f9ed622274f..003f6fe3cdb6 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -480,6 +480,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; + int wr = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -588,7 +589,7 @@ page_is_mapped: * This page will go to BIO. Do we need to send this BIO off first? */ if (bio && mpd->last_block_in_bio != blocks[0] - 1) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); alloc_new: if (bio == NULL) { @@ -612,7 +613,7 @@ alloc_new: */ length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); goto alloc_new; } @@ -622,7 +623,7 @@ alloc_new: set_page_writeback(page); unlock_page(page); if (boundary || (first_unmapped != blocks_per_page)) { - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); @@ -634,7 +635,7 @@ alloc_new: confused: if (bio) - bio = mpage_bio_submit(WRITE, bio); + bio = mpage_bio_submit(wr, bio); if (mpd->use_writepage) { ret = mapping->a_ops->writepage(page, wbc); @@ -690,8 +691,11 @@ mpage_writepages(struct address_space *mapping, }; ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } } blk_finish_plug(&plug); return ret; @@ -708,8 +712,11 @@ int mpage_writepage(struct page *page, get_block_t get_block, .use_writepage = 0, }; int ret = __mpage_writepage(page, wbc, &mpd); - if (mpd.bio) - mpage_bio_submit(WRITE, mpd.bio); + if (mpd.bio) { + int wr = (wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC : WRITE); + mpage_bio_submit(wr, mpd.bio); + } return ret; } EXPORT_SYMBOL(mpage_writepage); diff --git a/fs/namespace.c b/fs/namespace.c index b10db3d69943..019ff81536a2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -845,7 +845,7 @@ static void commit_tree(struct mount *mnt, struct mount *shadows) list_splice(&head, n->list.prev); if (shadows) - hlist_add_after_rcu(&shadows->mnt_hash, &mnt->mnt_hash); + hlist_add_behind_rcu(&mnt->mnt_hash, &shadows->mnt_hash); else hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile index 85c98737a146..fc603e0431bb 100644 --- a/fs/nilfs2/Makefile +++ b/fs/nilfs2/Makefile @@ -2,4 +2,4 @@ obj-$(CONFIG_NILFS2_FS) += nilfs2.o nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \ btnode.o bmap.o btree.o direct.o dat.o recovery.o \ the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \ - ifile.o alloc.o gcinode.o ioctl.o + ifile.o alloc.o gcinode.o ioctl.o sysfs.o diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 9bc72dec3fa6..0696161bf59d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -320,6 +320,14 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *); int nilfs_init_gcinode(struct inode *inode); void nilfs_remove_all_gcinodes(struct the_nilfs *nilfs); +/* sysfs.c */ +int __init nilfs_sysfs_init(void); +void nilfs_sysfs_exit(void); +int nilfs_sysfs_create_device_group(struct super_block *); +void nilfs_sysfs_delete_device_group(struct the_nilfs *); +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *); +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *); + /* * Inodes and files operations */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8c532b2ca3ab..48c9ce8c983d 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1014,7 +1014,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) struct dentry *dentry; int ret; - if (cno < 0 || cno > nilfs->ns_cno) + if (cno > nilfs->ns_cno) return false; if (cno >= nilfs_last_cno(nilfs)) @@ -1452,13 +1452,19 @@ static int __init init_nilfs_fs(void) if (err) goto fail; - err = register_filesystem(&nilfs_fs_type); + err = nilfs_sysfs_init(); if (err) goto free_cachep; + err = register_filesystem(&nilfs_fs_type); + if (err) + goto deinit_sysfs_entry; + printk(KERN_INFO "NILFS version 2 loaded\n"); return 0; +deinit_sysfs_entry: + nilfs_sysfs_exit(); free_cachep: nilfs_destroy_cachep(); fail: @@ -1468,6 +1474,7 @@ fail: static void __exit exit_nilfs_fs(void) { nilfs_destroy_cachep(); + nilfs_sysfs_exit(); unregister_filesystem(&nilfs_fs_type); } diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c new file mode 100644 index 000000000000..bbb0dcc35905 --- /dev/null +++ b/fs/nilfs2/sysfs.c @@ -0,0 +1,1137 @@ +/* + * sysfs.c - sysfs support implementation. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#include <linux/kobject.h> + +#include "nilfs.h" +#include "mdt.h" +#include "sufile.h" +#include "cpfile.h" +#include "sysfs.h" + +/* /sys/fs/<nilfs>/ */ +static struct kset *nilfs_kset; + +#define NILFS_SHOW_TIME(time_t_val, buf) ({ \ + struct tm res; \ + int count = 0; \ + time_to_tm(time_t_val, 0, &res); \ + res.tm_year += 1900; \ + res.tm_mon += 1; \ + count = scnprintf(buf, PAGE_SIZE, \ + "%ld-%.2d-%.2d %.2d:%.2d:%.2d\n", \ + res.tm_year, res.tm_mon, res.tm_mday, \ + res.tm_hour, res.tm_min, res.tm_sec);\ + count; \ +}) + +#define NILFS_DEV_INT_GROUP_OPS(name, parent_name) \ +static ssize_t nilfs_##name##_attr_show(struct kobject *kobj, \ + struct attribute *attr, char *buf) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->show ? a->show(a, nilfs, buf) : 0; \ +} \ +static ssize_t nilfs_##name##_attr_store(struct kobject *kobj, \ + struct attribute *attr, \ + const char *buf, size_t len) \ +{ \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + struct nilfs_##name##_attr *a = container_of(attr, \ + struct nilfs_##name##_attr, \ + attr); \ + return a->store ? a->store(a, nilfs, buf, len) : 0; \ +} \ +static const struct sysfs_ops nilfs_##name##_attr_ops = { \ + .show = nilfs_##name##_attr_show, \ + .store = nilfs_##name##_attr_store, \ +}; + +#define NILFS_DEV_INT_GROUP_TYPE(name, parent_name) \ +static void nilfs_##name##_attr_release(struct kobject *kobj) \ +{ \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + struct the_nilfs *nilfs = container_of(kobj->parent, \ + struct the_nilfs, \ + ns_##parent_name##_kobj); \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + complete(&subgroups->sg_##name##_kobj_unregister); \ +} \ +static struct kobj_type nilfs_##name##_ktype = { \ + .default_attrs = nilfs_##name##_attrs, \ + .sysfs_ops = &nilfs_##name##_attr_ops, \ + .release = nilfs_##name##_attr_release, \ +}; + +#define NILFS_DEV_INT_GROUP_FNS(name, parent_name) \ +static int nilfs_sysfs_create_##name##_group(struct the_nilfs *nilfs) \ +{ \ + struct kobject *parent; \ + struct kobject *kobj; \ + struct completion *kobj_unregister; \ + struct nilfs_sysfs_##parent_name##_subgroups *subgroups; \ + int err; \ + subgroups = nilfs->ns_##parent_name##_subgroups; \ + kobj = &subgroups->sg_##name##_kobj; \ + kobj_unregister = &subgroups->sg_##name##_kobj_unregister; \ + parent = &nilfs->ns_##parent_name##_kobj; \ + kobj->kset = nilfs_kset; \ + init_completion(kobj_unregister); \ + err = kobject_init_and_add(kobj, &nilfs_##name##_ktype, parent, \ + #name); \ + if (err) \ + return err; \ + return 0; \ +} \ +static void nilfs_sysfs_delete_##name##_group(struct the_nilfs *nilfs) \ +{ \ + kobject_del(&nilfs->ns_##parent_name##_subgroups->sg_##name##_kobj); \ +} + +/************************************************************************ + * NILFS snapshot attrs * + ************************************************************************/ + +static ssize_t +nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->inodes_count)); +} + +static ssize_t +nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)atomic64_read(&root->blocks_count)); +} + +static const char snapshot_readme_str[] = + "The group contains details about mounted snapshot.\n\n" + "(1) inodes_count\n\tshow number of inodes for snapshot.\n\n" + "(2) blocks_count\n\tshow number of blocks for snapshot.\n\n"; + +static ssize_t +nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, + struct nilfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, snapshot_readme_str); +} + +NILFS_SNAPSHOT_RO_ATTR(inodes_count); +NILFS_SNAPSHOT_RO_ATTR(blocks_count); +NILFS_SNAPSHOT_RO_ATTR(README); + +static struct attribute *nilfs_snapshot_attrs[] = { + NILFS_SNAPSHOT_ATTR_LIST(inodes_count), + NILFS_SNAPSHOT_ATTR_LIST(blocks_count), + NILFS_SNAPSHOT_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_snapshot_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->show ? a->show(a, root, buf) : 0; +} + +static ssize_t nilfs_snapshot_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct nilfs_root *root = + container_of(kobj, struct nilfs_root, snapshot_kobj); + struct nilfs_snapshot_attr *a = + container_of(attr, struct nilfs_snapshot_attr, attr); + + return a->store ? a->store(a, root, buf, len) : 0; +} + +static void nilfs_snapshot_attr_release(struct kobject *kobj) +{ + struct nilfs_root *root = container_of(kobj, struct nilfs_root, + snapshot_kobj); + complete(&root->snapshot_kobj_unregister); +} + +static const struct sysfs_ops nilfs_snapshot_attr_ops = { + .show = nilfs_snapshot_attr_show, + .store = nilfs_snapshot_attr_store, +}; + +static struct kobj_type nilfs_snapshot_ktype = { + .default_attrs = nilfs_snapshot_attrs, + .sysfs_ops = &nilfs_snapshot_attr_ops, + .release = nilfs_snapshot_attr_release, +}; + +int nilfs_sysfs_create_snapshot_group(struct nilfs_root *root) +{ + struct the_nilfs *nilfs; + struct kobject *parent; + int err; + + nilfs = root->nilfs; + parent = &nilfs->ns_dev_subgroups->sg_mounted_snapshots_kobj; + root->snapshot_kobj.kset = nilfs_kset; + init_completion(&root->snapshot_kobj_unregister); + + if (root->cno == NILFS_CPTREE_CURRENT_CNO) { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + &nilfs->ns_dev_kobj, + "current_checkpoint"); + } else { + err = kobject_init_and_add(&root->snapshot_kobj, + &nilfs_snapshot_ktype, + parent, + "%llu", root->cno); + } + + if (err) + return err; + + return 0; +} + +void nilfs_sysfs_delete_snapshot_group(struct nilfs_root *root) +{ + kobject_del(&root->snapshot_kobj); +} + +/************************************************************************ + * NILFS mounted snapshots attrs * + ************************************************************************/ + +static const char mounted_snapshots_readme_str[] = + "The mounted_snapshots group contains group for\n" + "every mounted snapshot.\n"; + +static ssize_t +nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str); +} + +NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); + +static struct attribute *nilfs_mounted_snapshots_attrs[] = { + NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_TYPE(mounted_snapshots, dev); +NILFS_DEV_INT_GROUP_FNS(mounted_snapshots, dev); + +/************************************************************************ + * NILFS checkpoints attrs * + ************************************************************************/ + +static ssize_t +nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 ncheckpoints; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + ncheckpoints = cpstat.cs_ncps; + + return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints); +} + +static ssize_t +nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nsnapshots; + struct nilfs_cpstat cpstat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get checkpoint stat: err=%d\n", + err); + return err; + } + + nsnapshots = cpstat.cs_nsss; + + return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots); +} + +static ssize_t +nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static const char checkpoints_readme_str[] = + "The checkpoints group contains attributes that describe\n" + "details about volume's checkpoints.\n\n" + "(1) checkpoints_number\n\tshow number of checkpoints on volume.\n\n" + "(2) snapshots_number\n\tshow number of snapshots on volume.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) next_checkpoint\n\tshow next checkpoint number.\n\n"; + +static ssize_t +nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, checkpoints_readme_str); +} + +NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); +NILFS_CHECKPOINTS_RO_ATTR(snapshots_number); +NILFS_CHECKPOINTS_RO_ATTR(last_seg_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(next_checkpoint); +NILFS_CHECKPOINTS_RO_ATTR(README); + +static struct attribute *nilfs_checkpoints_attrs[] = { + NILFS_CHECKPOINTS_ATTR_LIST(checkpoints_number), + NILFS_CHECKPOINTS_ATTR_LIST(snapshots_number), + NILFS_CHECKPOINTS_ATTR_LIST(last_seg_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(next_checkpoint), + NILFS_CHECKPOINTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(checkpoints, dev); +NILFS_DEV_INT_GROUP_TYPE(checkpoints, dev); +NILFS_DEV_INT_GROUP_FNS(checkpoints, dev); + +/************************************************************************ + * NILFS segments attrs * + ************************************************************************/ + +static ssize_t +nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments); +} + +static ssize_t +nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment); +} + +static ssize_t +nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long ncleansegs; + + down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); + up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs); +} + +static ssize_t +nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_sustat sustat; + int err; + + down_read(&nilfs->ns_segctor_sem); + err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat); + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + printk(KERN_ERR "NILFS: unable to get segment stat: err=%d\n", + err); + return err; + } + + return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs); +} + +static const char segments_readme_str[] = + "The segments group contains attributes that describe\n" + "details about volume's segments.\n\n" + "(1) segments_number\n\tshow number of segments on volume.\n\n" + "(2) blocks_per_segment\n\tshow number of blocks in segment.\n\n" + "(3) clean_segments\n\tshow count of clean segments.\n\n" + "(4) dirty_segments\n\tshow count of dirty segments.\n\n"; + +static ssize_t +nilfs_segments_README_show(struct nilfs_segments_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, segments_readme_str); +} + +NILFS_SEGMENTS_RO_ATTR(segments_number); +NILFS_SEGMENTS_RO_ATTR(blocks_per_segment); +NILFS_SEGMENTS_RO_ATTR(clean_segments); +NILFS_SEGMENTS_RO_ATTR(dirty_segments); +NILFS_SEGMENTS_RO_ATTR(README); + +static struct attribute *nilfs_segments_attrs[] = { + NILFS_SEGMENTS_ATTR_LIST(segments_number), + NILFS_SEGMENTS_ATTR_LIST(blocks_per_segment), + NILFS_SEGMENTS_ATTR_LIST(clean_segments), + NILFS_SEGMENTS_ATTR_LIST(dirty_segments), + NILFS_SEGMENTS_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segments, dev); +NILFS_DEV_INT_GROUP_TYPE(segments, dev); +NILFS_DEV_INT_GROUP_FNS(segments, dev); + +/************************************************************************ + * NILFS segctor attrs * + ************************************************************************/ + +static ssize_t +nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t last_pseg; + + spin_lock(&nilfs->ns_last_segment_lock); + last_pseg = nilfs->ns_last_pseg; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)last_pseg); +} + +static ssize_t +nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 last_seq; + + spin_lock(&nilfs->ns_last_segment_lock); + last_seq = nilfs->ns_last_seq; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq); +} + +static ssize_t +nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 last_cno; + + spin_lock(&nilfs->ns_last_segment_lock); + last_cno = nilfs->ns_last_cno; + spin_unlock(&nilfs->ns_last_segment_lock); + + return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); +} + +static ssize_t +nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u64 seg_seq; + + down_read(&nilfs->ns_sem); + seg_seq = nilfs->ns_seg_seq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); +} + +static ssize_t +nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 segnum; + + down_read(&nilfs->ns_sem); + segnum = nilfs->ns_segnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); +} + +static ssize_t +nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 nextnum; + + down_read(&nilfs->ns_sem); + nextnum = nilfs->ns_nextnum; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); +} + +static ssize_t +nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned long pseg_offset; + + down_read(&nilfs->ns_sem); + pseg_offset = nilfs->ns_pseg_offset; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); +} + +static ssize_t +nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + __u64 cno; + + down_read(&nilfs->ns_sem); + cno = nilfs->ns_cno; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", cno); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(ctime, buf); +} + +static ssize_t +nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t ctime; + + down_read(&nilfs->ns_sem); + ctime = nilfs->ns_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)ctime); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(nongc_ctime, buf); +} + +static ssize_t +nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t nongc_ctime; + + down_read(&nilfs->ns_sem); + nongc_ctime = nilfs->ns_nongc_ctime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)nongc_ctime); +} + +static ssize_t +nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + u32 ndirtyblks; + + down_read(&nilfs->ns_sem); + ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); +} + +static const char segctor_readme_str[] = + "The segctor group contains attributes that describe\n" + "segctor thread activity details.\n\n" + "(1) last_pseg_block\n" + "\tshow start block number of the latest segment.\n\n" + "(2) last_seg_sequence\n" + "\tshow sequence value of the latest segment.\n\n" + "(3) last_seg_checkpoint\n" + "\tshow checkpoint number of the latest segment.\n\n" + "(4) current_seg_sequence\n\tshow segment sequence counter.\n\n" + "(5) current_last_full_seg\n" + "\tshow index number of the latest full segment.\n\n" + "(6) next_full_seg\n" + "\tshow index number of the full segment index to be used next.\n\n" + "(7) next_pseg_offset\n" + "\tshow offset of next partial segment in the current full segment.\n\n" + "(8) next_checkpoint\n\tshow next checkpoint number.\n\n" + "(9) last_seg_write_time\n" + "\tshow write time of the last segment in human-readable format.\n\n" + "(10) last_seg_write_time_secs\n" + "\tshow write time of the last segment in seconds.\n\n" + "(11) last_nongc_write_time\n" + "\tshow write time of the last segment not for cleaner operation " + "in human-readable format.\n\n" + "(12) last_nongc_write_time_secs\n" + "\tshow write time of the last segment not for cleaner operation " + "in seconds.\n\n" + "(13) dirty_data_blocks_count\n" + "\tshow number of dirty data blocks.\n\n"; + +static ssize_t +nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, segctor_readme_str); +} + +NILFS_SEGCTOR_RO_ATTR(last_pseg_block); +NILFS_SEGCTOR_RO_ATTR(last_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(last_seg_checkpoint); +NILFS_SEGCTOR_RO_ATTR(current_seg_sequence); +NILFS_SEGCTOR_RO_ATTR(current_last_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_full_seg); +NILFS_SEGCTOR_RO_ATTR(next_pseg_offset); +NILFS_SEGCTOR_RO_ATTR(next_checkpoint); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time); +NILFS_SEGCTOR_RO_ATTR(last_seg_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time); +NILFS_SEGCTOR_RO_ATTR(last_nongc_write_time_secs); +NILFS_SEGCTOR_RO_ATTR(dirty_data_blocks_count); +NILFS_SEGCTOR_RO_ATTR(README); + +static struct attribute *nilfs_segctor_attrs[] = { + NILFS_SEGCTOR_ATTR_LIST(last_pseg_block), + NILFS_SEGCTOR_ATTR_LIST(last_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(last_seg_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(current_seg_sequence), + NILFS_SEGCTOR_ATTR_LIST(current_last_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_full_seg), + NILFS_SEGCTOR_ATTR_LIST(next_pseg_offset), + NILFS_SEGCTOR_ATTR_LIST(next_checkpoint), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_seg_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time), + NILFS_SEGCTOR_ATTR_LIST(last_nongc_write_time_secs), + NILFS_SEGCTOR_ATTR_LIST(dirty_data_blocks_count), + NILFS_SEGCTOR_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(segctor, dev); +NILFS_DEV_INT_GROUP_TYPE(segctor, dev); +NILFS_DEV_INT_GROUP_FNS(segctor, dev); + +/************************************************************************ + * NILFS superblock attrs * + ************************************************************************/ + +static ssize_t +nilfs_superblock_sb_write_time_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return NILFS_SHOW_TIME(sbwtime, buf); +} + +static ssize_t +nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + time_t sbwtime; + + down_read(&nilfs->ns_sem); + sbwtime = nilfs->ns_sbwtime; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)sbwtime); +} + +static ssize_t +nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sbwcount; + + down_read(&nilfs->ns_sem); + sbwcount = nilfs->ns_sbwcount; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + unsigned sb_update_freq; + + down_read(&nilfs->ns_sem); + sb_update_freq = nilfs->ns_sb_update_freq; + up_read(&nilfs->ns_sem); + + return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq); +} + +static ssize_t +nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, + const char *buf, size_t count) +{ + unsigned val; + int err; + + err = kstrtouint(skip_spaces(buf), 0, &val); + if (err) { + printk(KERN_ERR "NILFS: unable to convert string: err=%d\n", + err); + return err; + } + + if (val < NILFS_SB_FREQ) { + val = NILFS_SB_FREQ; + printk(KERN_WARNING "NILFS: superblock update frequency cannot be lesser than 10 seconds\n"); + } + + down_write(&nilfs->ns_sem); + nilfs->ns_sb_update_freq = val; + up_write(&nilfs->ns_sem); + + return count; +} + +static const char sb_readme_str[] = + "The superblock group contains attributes that describe\n" + "superblock's details.\n\n" + "(1) sb_write_time\n\tshow previous write time of super block " + "in human-readable format.\n\n" + "(2) sb_write_time_secs\n\tshow previous write time of super block " + "in seconds.\n\n" + "(3) sb_write_count\n\tshow write count of super block.\n\n" + "(4) sb_update_frequency\n" + "\tshow/set interval of periodical update of superblock (in seconds).\n\n" + "\tYou can set preferable frequency of superblock update by command:\n\n" + "\t'echo <val> > /sys/fs/<nilfs>/<dev>/superblock/sb_update_frequency'\n"; + +static ssize_t +nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, + struct the_nilfs *nilfs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, sb_readme_str); +} + +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_time_secs); +NILFS_SUPERBLOCK_RO_ATTR(sb_write_count); +NILFS_SUPERBLOCK_RW_ATTR(sb_update_frequency); +NILFS_SUPERBLOCK_RO_ATTR(README); + +static struct attribute *nilfs_superblock_attrs[] = { + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_time_secs), + NILFS_SUPERBLOCK_ATTR_LIST(sb_write_count), + NILFS_SUPERBLOCK_ATTR_LIST(sb_update_frequency), + NILFS_SUPERBLOCK_ATTR_LIST(README), + NULL, +}; + +NILFS_DEV_INT_GROUP_OPS(superblock, dev); +NILFS_DEV_INT_GROUP_TYPE(superblock, dev); +NILFS_DEV_INT_GROUP_FNS(superblock, dev); + +/************************************************************************ + * NILFS device attrs * + ************************************************************************/ + +static +ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u32 major = le32_to_cpu(sbp[0]->s_rev_level); + u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); + + return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor); +} + +static +ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize); +} + +static +ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); + + return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size); +} + +static +ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + sector_t free_blocks = 0; + + nilfs_count_free_blocks(nilfs, &free_blocks); + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)free_blocks); +} + +static +ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid); +} + +static +ssize_t nilfs_dev_volume_name_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + struct nilfs_super_block **sbp = nilfs->ns_sbp; + + return scnprintf(buf, sizeof(sbp[0]->s_volume_name), "%s\n", + sbp[0]->s_volume_name); +} + +static const char dev_readme_str[] = + "The <device> group contains attributes that describe file system\n" + "partition's details.\n\n" + "(1) revision\n\tshow NILFS file system revision.\n\n" + "(2) blocksize\n\tshow volume block size in bytes.\n\n" + "(3) device_size\n\tshow volume size in bytes.\n\n" + "(4) free_blocks\n\tshow count of free blocks on volume.\n\n" + "(5) uuid\n\tshow volume's UUID.\n\n" + "(6) volume_name\n\tshow volume's name.\n\n"; + +static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, + struct the_nilfs *nilfs, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, dev_readme_str); +} + +NILFS_DEV_RO_ATTR(revision); +NILFS_DEV_RO_ATTR(blocksize); +NILFS_DEV_RO_ATTR(device_size); +NILFS_DEV_RO_ATTR(free_blocks); +NILFS_DEV_RO_ATTR(uuid); +NILFS_DEV_RO_ATTR(volume_name); +NILFS_DEV_RO_ATTR(README); + +static struct attribute *nilfs_dev_attrs[] = { + NILFS_DEV_ATTR_LIST(revision), + NILFS_DEV_ATTR_LIST(blocksize), + NILFS_DEV_ATTR_LIST(device_size), + NILFS_DEV_ATTR_LIST(free_blocks), + NILFS_DEV_ATTR_LIST(uuid), + NILFS_DEV_ATTR_LIST(volume_name), + NILFS_DEV_ATTR_LIST(README), + NULL, +}; + +static ssize_t nilfs_dev_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->show ? a->show(a, nilfs, buf) : 0; +} + +static ssize_t nilfs_dev_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + struct nilfs_dev_attr *a = container_of(attr, struct nilfs_dev_attr, + attr); + + return a->store ? a->store(a, nilfs, buf, len) : 0; +} + +static void nilfs_dev_attr_release(struct kobject *kobj) +{ + struct the_nilfs *nilfs = container_of(kobj, struct the_nilfs, + ns_dev_kobj); + complete(&nilfs->ns_dev_kobj_unregister); +} + +static const struct sysfs_ops nilfs_dev_attr_ops = { + .show = nilfs_dev_attr_show, + .store = nilfs_dev_attr_store, +}; + +static struct kobj_type nilfs_dev_ktype = { + .default_attrs = nilfs_dev_attrs, + .sysfs_ops = &nilfs_dev_attr_ops, + .release = nilfs_dev_attr_release, +}; + +int nilfs_sysfs_create_device_group(struct super_block *sb) +{ + struct the_nilfs *nilfs = sb->s_fs_info; + size_t devgrp_size = sizeof(struct nilfs_sysfs_dev_subgroups); + int err; + + nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL); + if (unlikely(!nilfs->ns_dev_subgroups)) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to allocate memory for device group\n"); + goto failed_create_device_group; + } + + nilfs->ns_dev_kobj.kset = nilfs_kset; + init_completion(&nilfs->ns_dev_kobj_unregister); + err = kobject_init_and_add(&nilfs->ns_dev_kobj, &nilfs_dev_ktype, NULL, + "%s", sb->s_id); + if (err) + goto free_dev_subgroups; + + err = nilfs_sysfs_create_mounted_snapshots_group(nilfs); + if (err) + goto cleanup_dev_kobject; + + err = nilfs_sysfs_create_checkpoints_group(nilfs); + if (err) + goto delete_mounted_snapshots_group; + + err = nilfs_sysfs_create_segments_group(nilfs); + if (err) + goto delete_checkpoints_group; + + err = nilfs_sysfs_create_superblock_group(nilfs); + if (err) + goto delete_segments_group; + + err = nilfs_sysfs_create_segctor_group(nilfs); + if (err) + goto delete_superblock_group; + + return 0; + +delete_superblock_group: + nilfs_sysfs_delete_superblock_group(nilfs); + +delete_segments_group: + nilfs_sysfs_delete_segments_group(nilfs); + +delete_checkpoints_group: + nilfs_sysfs_delete_checkpoints_group(nilfs); + +delete_mounted_snapshots_group: + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + +cleanup_dev_kobject: + kobject_del(&nilfs->ns_dev_kobj); + +free_dev_subgroups: + kfree(nilfs->ns_dev_subgroups); + +failed_create_device_group: + return err; +} + +void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) +{ + nilfs_sysfs_delete_mounted_snapshots_group(nilfs); + nilfs_sysfs_delete_checkpoints_group(nilfs); + nilfs_sysfs_delete_segments_group(nilfs); + nilfs_sysfs_delete_superblock_group(nilfs); + nilfs_sysfs_delete_segctor_group(nilfs); + kobject_del(&nilfs->ns_dev_kobj); + kfree(nilfs->ns_dev_subgroups); +} + +/************************************************************************ + * NILFS feature attrs * + ************************************************************************/ + +static ssize_t nilfs_feature_revision_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d.%d\n", + NILFS_CURRENT_REV, NILFS_MINOR_REV); +} + +static const char features_readme_str[] = + "The features group contains attributes that describe NILFS file\n" + "system driver features.\n\n" + "(1) revision\n\tshow current revision of NILFS file system driver.\n"; + +static ssize_t nilfs_feature_README_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, features_readme_str); +} + +NILFS_FEATURE_RO_ATTR(revision); +NILFS_FEATURE_RO_ATTR(README); + +static struct attribute *nilfs_feature_attrs[] = { + NILFS_FEATURE_ATTR_LIST(revision), + NILFS_FEATURE_ATTR_LIST(README), + NULL, +}; + +static const struct attribute_group nilfs_feature_attr_group = { + .name = "features", + .attrs = nilfs_feature_attrs, +}; + +int __init nilfs_sysfs_init(void) +{ + int err; + + nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj); + if (!nilfs_kset) { + err = -ENOMEM; + printk(KERN_ERR "NILFS: unable to create sysfs entry: err %d\n", + err); + goto failed_sysfs_init; + } + + err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + if (unlikely(err)) { + printk(KERN_ERR "NILFS: unable to create feature group: err %d\n", + err); + goto cleanup_sysfs_init; + } + + return 0; + +cleanup_sysfs_init: + kset_unregister(nilfs_kset); + +failed_sysfs_init: + return err; +} + +void nilfs_sysfs_exit(void) +{ + sysfs_remove_group(&nilfs_kset->kobj, &nilfs_feature_attr_group); + kset_unregister(nilfs_kset); +} diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h new file mode 100644 index 000000000000..677e3a1a8370 --- /dev/null +++ b/fs/nilfs2/sysfs.h @@ -0,0 +1,176 @@ +/* + * sysfs.h - sysfs support declarations. + * + * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. + * Copyright (C) 2014 HGST, Inc., a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Written by Vyacheslav Dubeyko <Vyacheslav.Dubeyko@hgst.com> + */ + +#ifndef _NILFS_SYSFS_H +#define _NILFS_SYSFS_H + +#include <linux/sysfs.h> + +#define NILFS_ROOT_GROUP_NAME "nilfs2" + +/* + * struct nilfs_sysfs_dev_subgroups - device subgroup kernel objects + * @sg_superblock_kobj: /sys/fs/<nilfs>/<device>/superblock + * @sg_superblock_kobj_unregister: completion state + * @sg_segctor_kobj: /sys/fs/<nilfs>/<device>/segctor + * @sg_segctor_kobj_unregister: completion state + * @sg_mounted_snapshots_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots + * @sg_mounted_snapshots_kobj_unregister: completion state + * @sg_checkpoints_kobj: /sys/fs/<nilfs>/<device>/checkpoints + * @sg_checkpoints_kobj_unregister: completion state + * @sg_segments_kobj: /sys/fs/<nilfs>/<device>/segments + * @sg_segments_kobj_unregister: completion state + */ +struct nilfs_sysfs_dev_subgroups { + /* /sys/fs/<nilfs>/<device>/superblock */ + struct kobject sg_superblock_kobj; + struct completion sg_superblock_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segctor */ + struct kobject sg_segctor_kobj; + struct completion sg_segctor_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots */ + struct kobject sg_mounted_snapshots_kobj; + struct completion sg_mounted_snapshots_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/checkpoints */ + struct kobject sg_checkpoints_kobj; + struct completion sg_checkpoints_kobj_unregister; + + /* /sys/fs/<nilfs>/<device>/segments */ + struct kobject sg_segments_kobj; + struct completion sg_segments_kobj_unregister; +}; + +#define NILFS_COMMON_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct kobject *, struct attribute *, \ + char *); \ + ssize_t (*store)(struct kobject *, struct attribute *, \ + const char *, size_t); \ +}; + +NILFS_COMMON_ATTR_STRUCT(feature); + +#define NILFS_DEV_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct the_nilfs *, \ + const char *, size_t); \ +}; + +NILFS_DEV_ATTR_STRUCT(dev); +NILFS_DEV_ATTR_STRUCT(segments); +NILFS_DEV_ATTR_STRUCT(mounted_snapshots); +NILFS_DEV_ATTR_STRUCT(checkpoints); +NILFS_DEV_ATTR_STRUCT(superblock); +NILFS_DEV_ATTR_STRUCT(segctor); + +#define NILFS_CP_ATTR_STRUCT(name) \ +struct nilfs_##name##_attr { \ + struct attribute attr; \ + ssize_t (*show)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + char *); \ + ssize_t (*store)(struct nilfs_##name##_attr *, struct nilfs_root *, \ + const char *, size_t); \ +}; + +NILFS_CP_ATTR_STRUCT(snapshot); + +#define NILFS_ATTR(type, name, mode, show, store) \ + static struct nilfs_##type##_attr nilfs_##type##_attr_##name = \ + __ATTR(name, mode, show, store) + +#define NILFS_INFO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, NULL, NULL) +#define NILFS_RO_ATTR(type, name) \ + NILFS_ATTR(type, name, 0444, nilfs_##type##_##name##_show, NULL) +#define NILFS_RW_ATTR(type, name) \ + NILFS_ATTR(type, name, 0644, \ + nilfs_##type##_##name##_show, \ + nilfs_##type##_##name##_store) + +#define NILFS_FEATURE_INFO_ATTR(name) \ + NILFS_INFO_ATTR(feature, name) +#define NILFS_FEATURE_RO_ATTR(name) \ + NILFS_RO_ATTR(feature, name) +#define NILFS_FEATURE_RW_ATTR(name) \ + NILFS_RW_ATTR(feature, name) + +#define NILFS_DEV_INFO_ATTR(name) \ + NILFS_INFO_ATTR(dev, name) +#define NILFS_DEV_RO_ATTR(name) \ + NILFS_RO_ATTR(dev, name) +#define NILFS_DEV_RW_ATTR(name) \ + NILFS_RW_ATTR(dev, name) + +#define NILFS_SEGMENTS_RO_ATTR(name) \ + NILFS_RO_ATTR(segments, name) +#define NILFS_SEGMENTS_RW_ATTR(name) \ + NILFS_RW_ATTR(segs_info, name) + +#define NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(name) \ + NILFS_RO_ATTR(mounted_snapshots, name) + +#define NILFS_CHECKPOINTS_RO_ATTR(name) \ + NILFS_RO_ATTR(checkpoints, name) +#define NILFS_CHECKPOINTS_RW_ATTR(name) \ + NILFS_RW_ATTR(checkpoints, name) + +#define NILFS_SNAPSHOT_INFO_ATTR(name) \ + NILFS_INFO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RO_ATTR(name) \ + NILFS_RO_ATTR(snapshot, name) +#define NILFS_SNAPSHOT_RW_ATTR(name) \ + NILFS_RW_ATTR(snapshot, name) + +#define NILFS_SUPERBLOCK_RO_ATTR(name) \ + NILFS_RO_ATTR(superblock, name) +#define NILFS_SUPERBLOCK_RW_ATTR(name) \ + NILFS_RW_ATTR(superblock, name) + +#define NILFS_SEGCTOR_INFO_ATTR(name) \ + NILFS_INFO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RO_ATTR(name) \ + NILFS_RO_ATTR(segctor, name) +#define NILFS_SEGCTOR_RW_ATTR(name) \ + NILFS_RW_ATTR(segctor, name) + +#define NILFS_FEATURE_ATTR_LIST(name) \ + (&nilfs_feature_attr_##name.attr) +#define NILFS_DEV_ATTR_LIST(name) \ + (&nilfs_dev_attr_##name.attr) +#define NILFS_SEGMENTS_ATTR_LIST(name) \ + (&nilfs_segments_attr_##name.attr) +#define NILFS_MOUNTED_SNAPSHOTS_ATTR_LIST(name) \ + (&nilfs_mounted_snapshots_attr_##name.attr) +#define NILFS_CHECKPOINTS_ATTR_LIST(name) \ + (&nilfs_checkpoints_attr_##name.attr) +#define NILFS_SNAPSHOT_ATTR_LIST(name) \ + (&nilfs_snapshot_attr_##name.attr) +#define NILFS_SUPERBLOCK_ATTR_LIST(name) \ + (&nilfs_superblock_attr_##name.attr) +#define NILFS_SEGCTOR_ATTR_LIST(name) \ + (&nilfs_segctor_attr_##name.attr) + +#endif /* _NILFS_SYSFS_H */ diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8ba8229ba076..9da25fe9ea61 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -85,6 +85,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_cptree = RB_ROOT; spin_lock_init(&nilfs->ns_cptree_lock); init_rwsem(&nilfs->ns_segctor_sem); + nilfs->ns_sb_update_freq = NILFS_SB_FREQ; return nilfs; } @@ -97,6 +98,7 @@ void destroy_nilfs(struct the_nilfs *nilfs) { might_sleep(); if (nilfs_init(nilfs)) { + nilfs_sysfs_delete_device_group(nilfs); brelse(nilfs->ns_sbh[0]); brelse(nilfs->ns_sbh[1]); } @@ -640,6 +642,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data) if (err) goto failed_sbh; + err = nilfs_sysfs_create_device_group(sb); + if (err) + goto failed_sbh; + set_nilfs_init(nilfs); err = 0; out: @@ -740,12 +746,13 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) { struct rb_node **p, *parent; struct nilfs_root *root, *new; + int err; root = nilfs_lookup_root(nilfs, cno); if (root) return root; - new = kmalloc(sizeof(*root), GFP_KERNEL); + new = kzalloc(sizeof(*root), GFP_KERNEL); if (!new) return NULL; @@ -782,6 +789,12 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) spin_unlock(&nilfs->ns_cptree_lock); + err = nilfs_sysfs_create_snapshot_group(new); + if (err) { + kfree(new); + new = NULL; + } + return new; } @@ -790,6 +803,8 @@ void nilfs_put_root(struct nilfs_root *root) if (atomic_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; + nilfs_sysfs_delete_snapshot_group(root); + spin_lock(&nilfs->ns_cptree_lock); rb_erase(&root->rb_node, &nilfs->ns_cptree); spin_unlock(&nilfs->ns_cptree_lock); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index de8cc53b4a5c..d01ead1bea9a 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -33,6 +33,7 @@ #include <linux/slab.h> struct nilfs_sc_info; +struct nilfs_sysfs_dev_subgroups; /* the_nilfs struct */ enum { @@ -54,6 +55,7 @@ enum { * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_mount_state: file system state + * @ns_sb_update_freq: interval of periodical update of superblocks (in seconds) * @ns_seg_seq: segment sequence counter * @ns_segnum: index number of the latest full segment. * @ns_nextnum: index number of the full segment index to be used next @@ -95,6 +97,9 @@ enum { * @ns_inode_size: size of on-disk inode * @ns_first_ino: first not-special inode number * @ns_crc_seed: seed value of CRC32 calculation + * @ns_dev_kobj: /sys/fs/<nilfs>/<device> + * @ns_dev_kobj_unregister: completion state + * @ns_dev_subgroups: <device> subgroups pointer */ struct the_nilfs { unsigned long ns_flags; @@ -114,6 +119,7 @@ struct the_nilfs { unsigned ns_sbwcount; unsigned ns_sbsize; unsigned ns_mount_state; + unsigned ns_sb_update_freq; /* * Following fields are dedicated to a writable FS-instance. @@ -188,6 +194,11 @@ struct the_nilfs { int ns_inode_size; int ns_first_ino; u32 ns_crc_seed; + + /* /sys/fs/<nilfs>/<device> */ + struct kobject ns_dev_kobj; + struct completion ns_dev_kobj_unregister; + struct nilfs_sysfs_dev_subgroups *ns_dev_subgroups; }; #define THE_NILFS_FNS(bit, name) \ @@ -232,6 +243,8 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) * @ifile: inode file * @inodes_count: number of inodes * @blocks_count: number of blocks + * @snapshot_kobj: /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> + * @snapshot_kobj_unregister: completion state for kernel object */ struct nilfs_root { __u64 cno; @@ -243,6 +256,10 @@ struct nilfs_root { atomic64_t inodes_count; atomic64_t blocks_count; + + /* /sys/fs/<nilfs>/<device>/mounted_snapshots/<snapshot> */ + struct kobject snapshot_kobj; + struct completion snapshot_kobj_unregister; }; /* Special checkpoint number */ @@ -254,7 +271,8 @@ struct nilfs_root { static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); - return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; + return t < nilfs->ns_sbwtime || + t > nilfs->ns_sbwtime + nilfs->ns_sb_update_freq; } static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 74825be65b7b..9ce062218de9 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -232,7 +232,7 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->i.i_list, &mark->i.i_list); + hlist_add_behind_rcu(&mark->i.i_list, &last->i.i_list); out: fsnotify_recalc_inode_mask_locked(inode); spin_unlock(&inode->i_lock); diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c index 68ca5a8704b5..ac851e8376b1 100644 --- a/fs/notify/vfsmount_mark.c +++ b/fs/notify/vfsmount_mark.c @@ -191,7 +191,7 @@ int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ - hlist_add_after_rcu(&last->m.m_list, &mark->m.m_list); + hlist_add_behind_rcu(&mark->m.m_list, &last->m.m_list); out: fsnotify_recalc_vfsmount_mask_locked(mnt); spin_unlock(&mnt->mnt_root->d_lock); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 5c9e2c81cb11..f5ec1ce7a532 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -74,8 +74,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp) * ntfs_attr_extend_initialized - extend the initialized size of an attribute * @ni: ntfs inode of the attribute to extend * @new_init_size: requested new initialized size in bytes - * @cached_page: store any allocated but unused page here - * @lru_pvec: lru-buffering pagevec of the caller * * Extend the initialized size of an attribute described by the ntfs inode @ni * to @new_init_size bytes. This involves zeroing any non-sparse space between @@ -395,7 +393,6 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, * @nr_pages: number of page cache pages to obtain * @pages: array of pages in which to return the obtained page cache pages * @cached_page: allocated but as yet unused page - * @lru_pvec: lru-buffering pagevec of caller * * Obtain @nr_pages locked page cache pages from the mapping @mapping and * starting at index @index. diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9d8fcf2f3b94..a93bf9892256 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4961,6 +4961,15 @@ leftright: el = path_leaf_el(path); split_index = ocfs2_search_extent_list(el, cpos); + if (split_index == -1) { + ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), + "Owner %llu has an extent at cpos %u " + "which can no longer be found.\n", + (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), + cpos); + ret = -EROFS; + goto out; + } goto leftright; } out: @@ -5135,7 +5144,7 @@ int ocfs2_change_extent_flag(handle_t *handle, el = path_leaf_el(left_path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5491,7 +5500,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has an extent at cpos %u which can no " "longer be found.\n", @@ -5557,7 +5566,7 @@ int ocfs2_remove_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu: split at cpos %u lost record.", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4a231a166cf8..a06b23773384 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1817,16 +1817,6 @@ try_again: if (ret) goto out_commit; } - /* - * We don't want this to fail in ocfs2_write_end(), so do it - * here. - */ - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out_quota; - } /* * Fill our page array first. That way we've grabbed enough so @@ -1977,7 +1967,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { - int i; + int i, ret; unsigned from, to, start = pos & (PAGE_CACHE_SIZE - 1); struct inode *inode = mapping->host; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2027,6 +2017,14 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } } + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + copied = ret; + mlog_errno(ret); + goto out; + } + out_write_size: pos += copied; if (pos > i_size_read(inode)) { @@ -2041,6 +2039,7 @@ out_write_size: ocfs2_update_inode_fsync_trans(handle, inode, 1); ocfs2_journal_dirty(handle, wc->w_di_bh); +out: ocfs2_commit_trans(osb, handle); ocfs2_run_deallocs(osb, &wc->w_dealloc); diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 1ec141e758d7..62e8ec619b4c 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -160,9 +160,18 @@ static void o2quo_make_decision(struct work_struct *work) } out: - spin_unlock(&qs->qs_lock); - if (fence) + if (fence) { + spin_unlock(&qs->qs_lock); o2quo_fence_self(); + } else { + mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, " + "connected: %d, lowest: %d (%sreachable)\n", + qs->qs_heartbeating, qs->qs_connected, lowest_hb, + lowest_reachable ? "" : "un"); + spin_unlock(&qs->qs_lock); + + } + } static void o2quo_set_hold(struct o2quo_state *qs, u8 node) diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 681691bc233a..ea34952f9496 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1480,6 +1480,14 @@ static int o2net_set_nodelay(struct socket *sock) return ret; } +static int o2net_set_usertimeout(struct socket *sock) +{ + int user_timeout = O2NET_TCP_USER_TIMEOUT; + + return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, + (char *)&user_timeout, sizeof(user_timeout)); +} + static void o2net_initialize_handshake(void) { o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( @@ -1536,16 +1544,20 @@ static void o2net_idle_timer(unsigned long data) #endif printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been " - "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc), - msecs / 1000, msecs % 1000); + "idle for %lu.%lu secs.\n", + SC_NODEF_ARGS(sc), msecs / 1000, msecs % 1000); - /* - * Initialize the nn_timeout so that the next connection attempt - * will continue in o2net_start_connect. + /* idle timerout happen, don't shutdown the connection, but + * make fence decision. Maybe the connection can recover before + * the decision is made. */ atomic_set(&nn->nn_timeout, 1); + o2quo_conn_err(o2net_num_from_nn(nn)); + queue_delayed_work(o2net_wq, &nn->nn_still_up, + msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); + + o2net_sc_reset_idle_timer(sc); - o2net_sc_queue_work(sc, &sc->sc_shutdown_work); } static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) @@ -1560,6 +1572,15 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc) static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) { + struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); + + /* clear fence decision since the connection recover from timeout*/ + if (atomic_read(&nn->nn_timeout)) { + o2quo_conn_up(o2net_num_from_nn(nn)); + cancel_delayed_work(&nn->nn_still_up); + atomic_set(&nn->nn_timeout, 0); + } + /* Only push out an existing timer */ if (timer_pending(&sc->sc_idle_timeout)) o2net_sc_reset_idle_timer(sc); @@ -1650,6 +1671,12 @@ static void o2net_start_connect(struct work_struct *work) goto out; } + ret = o2net_set_usertimeout(sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + o2net_register_callbacks(sc->sc_sock->sk, sc); spin_lock(&nn->nn_lock); @@ -1831,6 +1858,12 @@ static int o2net_accept_one(struct socket *sock, int *more) goto out; } + ret = o2net_set_usertimeout(new_sock); + if (ret) { + mlog(ML_ERROR, "set TCP_USER_TIMEOUT failed with %d\n", ret); + goto out; + } + slen = sizeof(sin); ret = new_sock->ops->getname(new_sock, (struct sockaddr *) &sin, &slen, 1); diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h index 5bada2a69b50..c571e849fda4 100644 --- a/fs/ocfs2/cluster/tcp.h +++ b/fs/ocfs2/cluster/tcp.h @@ -63,6 +63,7 @@ typedef void (o2net_post_msg_handler_func)(int status, void *data, #define O2NET_KEEPALIVE_DELAY_MS_DEFAULT 2000 #define O2NET_IDLE_TIMEOUT_MS_DEFAULT 30000 +#define O2NET_TCP_USER_TIMEOUT 0x7fffffff /* TODO: figure this out.... */ static inline int o2net_link_down(int err, struct socket *sock) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 39efc5057a36..3fcf205ee900 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1923,12 +1923,11 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } - if (total_backoff > - msecs_to_jiffies(DLM_JOIN_TIMEOUT_MSECS)) { + if (total_backoff > DLM_JOIN_TIMEOUT_MSECS) { status = -ERESTARTSYS; mlog(ML_NOTICE, "Timed out joining dlm domain " "%s after %u msecs\n", dlm->name, - jiffies_to_msecs(total_backoff)); + total_backoff); goto bail; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 82abf0cc9a12..eb42b2988765 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -694,14 +694,6 @@ void __dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, res->inflight_assert_workers); } -static void dlm_lockres_grab_inflight_worker(struct dlm_ctxt *dlm, - struct dlm_lock_resource *res) -{ - spin_lock(&res->spinlock); - __dlm_lockres_grab_inflight_worker(dlm, res); - spin_unlock(&res->spinlock); -} - static void __dlm_lockres_drop_inflight_worker(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) { @@ -1635,6 +1627,7 @@ send_response: } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); + spin_lock(&res->spinlock); ret = dlm_dispatch_assert_master(dlm, res, 0, request->node_idx, DLM_ASSERT_MASTER_MLE_CLEANUP); if (ret < 0) { @@ -1642,7 +1635,8 @@ send_response: response = DLM_MASTER_RESP_ERROR; dlm_lockres_put(res); } else - dlm_lockres_grab_inflight_worker(dlm, res); + __dlm_lockres_grab_inflight_worker(dlm, res); + spin_unlock(&res->spinlock); } else { if (res) dlm_lockres_put(res); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 437de7f768c6..21708cda4b8a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1192,17 +1192,9 @@ void ocfs2_evict_inode(struct inode *inode) int ocfs2_drop_inode(struct inode *inode) { struct ocfs2_inode_info *oi = OCFS2_I(inode); - int res; - trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags); - - if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED) - res = 1; - else - res = generic_drop_inode(inode); - - return res; + return 1; } /* diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 6f66b3751ace..53e6c40ed4c6 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -35,9 +35,8 @@ copy_to_user((typeof(a) __user *)b, &(a), sizeof(a)) /* - * This call is void because we are already reporting an error that may - * be -EFAULT. The error will be returned from the ioctl(2) call. It's - * just a best-effort to tell userspace that this request caused the error. + * This is just a best-effort to tell userspace that this request + * caused the error. */ static inline void o2info_set_request_error(struct ocfs2_info_request *kreq, struct ocfs2_info_request __user *req) @@ -146,136 +145,105 @@ bail: static int ocfs2_info_handle_blocksize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_blocksize oib; if (o2info_from_user(oib, req)) - goto bail; + return -EFAULT; oib.ib_blocksize = inode->i_sb->s_blocksize; o2info_set_request_filled(&oib.ib_req); if (o2info_to_user(oib, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oib.ib_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_clustersize(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_clustersize oic; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oic, req)) - goto bail; + return -EFAULT; oic.ic_clustersize = osb->s_clustersize; o2info_set_request_filled(&oic.ic_req); if (o2info_to_user(oic, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oic.ic_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_maxslots(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_maxslots oim; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oim, req)) - goto bail; + return -EFAULT; oim.im_max_slots = osb->max_slots; o2info_set_request_filled(&oim.im_req); if (o2info_to_user(oim, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oim.im_req, req); - - return status; + return 0; } static int ocfs2_info_handle_label(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_label oil; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oil, req)) - goto bail; + return -EFAULT; memcpy(oil.il_label, osb->vol_label, OCFS2_MAX_VOL_LABEL_LEN); o2info_set_request_filled(&oil.il_req); if (o2info_to_user(oil, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oil.il_req, req); - - return status; + return 0; } static int ocfs2_info_handle_uuid(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_uuid oiu; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oiu, req)) - goto bail; + return -EFAULT; memcpy(oiu.iu_uuid_str, osb->uuid_str, OCFS2_TEXT_UUID_LEN + 1); o2info_set_request_filled(&oiu.iu_req); if (o2info_to_user(oiu, req)) - goto bail; - - status = 0; -bail: - if (status) - o2info_set_request_error(&oiu.iu_req, req); + return -EFAULT; - return status; + return 0; } static int ocfs2_info_handle_fs_features(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_fs_features oif; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oif, req)) - goto bail; + return -EFAULT; oif.if_compat_features = osb->s_feature_compat; oif.if_incompat_features = osb->s_feature_incompat; @@ -284,39 +252,28 @@ static int ocfs2_info_handle_fs_features(struct inode *inode, o2info_set_request_filled(&oif.if_req); if (o2info_to_user(oif, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oif.if_req, req); - - return status; + return 0; } static int ocfs2_info_handle_journal_size(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_journal_size oij; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if (o2info_from_user(oij, req)) - goto bail; + return -EFAULT; oij.ij_journal_size = i_size_read(osb->journal->j_inode); o2info_set_request_filled(&oij.ij_req); if (o2info_to_user(oij, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oij.ij_req, req); - - return status; + return 0; } static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, @@ -373,7 +330,7 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, u32 i; u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = INODE_ALLOC_SYSTEM_INODE; + int status, type = INODE_ALLOC_SYSTEM_INODE; struct ocfs2_info_freeinode *oifi = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct inode *inode_alloc = NULL; @@ -385,8 +342,10 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, goto out_err; } - if (o2info_from_user(*oifi, req)) - goto bail; + if (o2info_from_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } oifi->ifi_slotnum = osb->max_slots; @@ -424,14 +383,16 @@ static int ocfs2_info_handle_freeinode(struct inode *inode, o2info_set_request_filled(&oifi->ifi_req); - if (o2info_to_user(*oifi, req)) - goto bail; + if (o2info_to_user(*oifi, req)) { + status = -EFAULT; + goto out_free; + } status = 0; bail: if (status) o2info_set_request_error(&oifi->ifi_req, req); - +out_free: kfree(oifi); out_err: return status; @@ -658,7 +619,7 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, { u64 blkno = -1; char namebuf[40]; - int status = -EFAULT, type = GLOBAL_BITMAP_SYSTEM_INODE; + int status, type = GLOBAL_BITMAP_SYSTEM_INODE; struct ocfs2_info_freefrag *oiff; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -671,8 +632,10 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, goto out_err; } - if (o2info_from_user(*oiff, req)) - goto bail; + if (o2info_from_user(*oiff, req)) { + status = -EFAULT; + goto out_free; + } /* * chunksize from userspace should be power of 2. */ @@ -711,14 +674,14 @@ static int ocfs2_info_handle_freefrag(struct inode *inode, if (o2info_to_user(*oiff, req)) { status = -EFAULT; - goto bail; + goto out_free; } status = 0; bail: if (status) o2info_set_request_error(&oiff->iff_req, req); - +out_free: kfree(oiff); out_err: return status; @@ -727,23 +690,17 @@ out_err: static int ocfs2_info_handle_unknown(struct inode *inode, struct ocfs2_info_request __user *req) { - int status = -EFAULT; struct ocfs2_info_request oir; if (o2info_from_user(oir, req)) - goto bail; + return -EFAULT; o2info_clear_request_filled(&oir); if (o2info_to_user(oir, req)) - goto bail; + return -EFAULT; - status = 0; -bail: - if (status) - o2info_set_request_error(&oir, req); - - return status; + return 0; } /* diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 599eb4c4c8be..6219aaadeb08 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -98,7 +98,7 @@ static int __ocfs2_move_extent(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(inode->i_sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 636aab69ead5..d81f6e2a97f5 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -3109,7 +3109,7 @@ static int ocfs2_clear_ext_refcount(handle_t *handle, el = path_leaf_el(path); index = ocfs2_search_extent_list(el, cpos); - if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { + if (index == -1) { ocfs2_error(sb, "Inode %llu has an extent at cpos %u which can no " "longer be found.\n", diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index 1424c151cccc..a88b2a4fcc85 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -382,7 +382,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, trace_ocfs2_map_slot_buffers(bytes, si->si_blocks); - si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks, + si->si_bh = kcalloc(si->si_blocks, sizeof(struct buffer_head *), GFP_KERNEL); if (!si->si_bh) { status = -ENOMEM; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ec58c7659183..ba8819702c56 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -321,7 +321,7 @@ static int omfs_get_imap(struct super_block *sb) goto out; sbi->s_imap_size = array_size; - sbi->s_imap = kzalloc(array_size * sizeof(unsigned long *), GFP_KERNEL); + sbi->s_imap = kcalloc(array_size, sizeof(unsigned long *), GFP_KERNEL); if (!sbi->s_imap) goto nomem; diff --git a/fs/proc/base.c b/fs/proc/base.c index e442784ef74a..f50d4be6b27f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2445,7 +2445,7 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) #ifdef CONFIG_USER_NS static int proc_id_map_open(struct inode *inode, struct file *file, - struct seq_operations *seq_ops) + const struct seq_operations *seq_ops) { struct user_namespace *ns = NULL; struct task_struct *task; @@ -2774,12 +2774,12 @@ out: struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, unsigned int flags) { - int result = 0; + int result = -ENOENT; struct task_struct *task; unsigned tgid; struct pid_namespace *ns; - tgid = name_to_int(dentry); + tgid = name_to_int(&dentry->d_name); if (tgid == ~0U) goto out; @@ -3027,7 +3027,7 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry if (!leader) goto out_no_task; - tid = name_to_int(dentry); + tid = name_to_int(&dentry->d_name); if (tid == ~0U) goto out; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index eb82e9f00f58..e11d7c590bb0 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -204,7 +204,7 @@ static struct dentry *proc_lookupfd_common(struct inode *dir, { struct task_struct *task = get_proc_task(dir); int result = -ENOENT; - unsigned fd = name_to_int(dentry); + unsigned fd = name_to_int(&dentry->d_name); if (!task) goto out_no_task; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index b7f268eb5f45..317b72641ebf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -27,7 +27,7 @@ #include "internal.h" -DEFINE_SPINLOCK(proc_subdir_lock); +static DEFINE_SPINLOCK(proc_subdir_lock); static int proc_match(unsigned int len, const char *name, struct proc_dir_entry *de) { @@ -330,28 +330,28 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, nlink_t nlink) { struct proc_dir_entry *ent = NULL; - const char *fn = name; - unsigned int len; - - /* make sure name is valid */ - if (!name || !strlen(name)) - goto out; + const char *fn; + struct qstr qstr; if (xlate_proc_name(name, parent, &fn) != 0) goto out; + qstr.name = fn; + qstr.len = strlen(fn); + if (qstr.len == 0 || qstr.len >= 256) { + WARN(1, "name len %u\n", qstr.len); + return NULL; + } + if (*parent == &proc_root && name_to_int(&qstr) != ~0U) { + WARN(1, "create '/proc/%s' by hand\n", qstr.name); + return NULL; + } - /* At this point there must not be any '/' characters beyond *fn */ - if (strchr(fn, '/')) - goto out; - - len = strlen(fn); - - ent = kzalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL); + ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) goto out; - memcpy(ent->name, fn, len + 1); - ent->namelen = len; + memcpy(ent->name, fn, qstr.len + 1); + ent->namelen = qstr.len; ent->mode = mode; ent->nlink = nlink; atomic_set(&ent->count, 1); diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3ab6d14e71c5..a17accbd05ee 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -112,10 +112,10 @@ static inline int task_dumpable(struct task_struct *task) return 0; } -static inline unsigned name_to_int(struct dentry *dentry) +static inline unsigned name_to_int(const struct qstr *qstr) { - const char *name = dentry->d_name.name; - int len = dentry->d_name.len; + const char *name = qstr->name; + int len = qstr->len; unsigned n = 0; if (len > 1 && *name == '0') @@ -178,8 +178,6 @@ extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, i /* * generic.c */ -extern spinlock_t proc_subdir_lock; - extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); extern struct dentry *proc_lookup_de(struct proc_dir_entry *, struct inode *, struct dentry *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 39e6ef32f0bd..6df8d0722c97 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -172,7 +172,7 @@ get_sparsemem_vmemmap_info(struct kcore_list *ent, struct list_head *head) start = ((unsigned long)pfn_to_page(pfn)) & PAGE_MASK; end = ((unsigned long)pfn_to_page(pfn + nr_pages)) - 1; - end = ALIGN(end, PAGE_SIZE); + end = PAGE_ALIGN(end); /* overlap check (because we have to align page */ list_for_each_entry(tmp, head, list) { if (tmp->type != KCORE_VMEMMAP) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 7445af0b1aa3..aa1eee06420f 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -168,7 +168,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_WRITEBACK)), K(global_page_state(NR_ANON_PAGES)), K(global_page_state(NR_FILE_MAPPED)), - K(global_page_state(NR_SHMEM)), + K(i.sharedram), K(global_page_state(NR_SLAB_RECLAIMABLE) + global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_SLAB_RECLAIMABLE)), diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 71290463a1d3..f92d5dd578a4 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -632,7 +632,7 @@ out: return ret; } -static int scan(struct ctl_table_header *head, ctl_table *table, +static int scan(struct ctl_table_header *head, struct ctl_table *table, unsigned long *pos, struct file *file, struct dir_context *ctx) { diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index cb761f010300..15f327bed8c6 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -18,7 +18,7 @@ /* * The /proc/tty directory inodes... */ -static struct proc_dir_entry *proc_tty_ldisc, *proc_tty_driver; +static struct proc_dir_entry *proc_tty_driver; /* * This is the handler for /proc/tty/drivers @@ -176,7 +176,7 @@ void __init proc_tty_init(void) { if (!proc_mkdir("tty", NULL)) return; - proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); + proc_mkdir("tty/ldisc", NULL); /* Preserved: it's userspace visible */ /* * /proc/tty/driver/serial reveals the exact character counts for * serial links which is just too easy to abuse for inferring diff --git a/fs/proc/root.c b/fs/proc/root.c index 5dbadecb234d..574bafc41f0b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -199,10 +199,10 @@ static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) { - if (!proc_lookup(dir, dentry, flags)) + if (!proc_pid_lookup(dir, dentry, flags)) return NULL; - return proc_pid_lookup(dir, dentry, flags); + return proc_lookup(dir, dentry, flags); } static int proc_root_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 382aa890e228..a18e651a4d1c 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -328,6 +328,82 @@ static inline char *alloc_elfnotes_buf(size_t notes_sz) * virtually contiguous user-space in ELF layout. */ #ifdef CONFIG_MMU +/* + * remap_oldmem_pfn_checked - do remap_oldmem_pfn_range replacing all pages + * reported as not being ram with the zero page. + * + * @vma: vm_area_struct describing requested mapping + * @from: start remapping from + * @pfn: page frame number to start remapping to + * @size: remapping size + * @prot: protection bits + * + * Returns zero on success, -EAGAIN on failure. + */ +int remap_oldmem_pfn_checked(struct vm_area_struct *vma, unsigned long from, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + unsigned long map_size; + unsigned long pos_start, pos_end, pos; + unsigned long zeropage_pfn = my_zero_pfn(0); + size_t len = 0; + + pos_start = pfn; + pos_end = pfn + (size >> PAGE_SHIFT); + + for (pos = pos_start; pos < pos_end; ++pos) { + if (!pfn_is_ram(pos)) { + /* + * We hit a page which is not ram. Remap the continuous + * region between pos_start and pos-1 and replace + * the non-ram page at pos with the zero page. + */ + if (pos > pos_start) { + /* Remap continuous region */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, + pos_start, map_size, + prot)) + goto fail; + len += map_size; + } + /* Remap the zero page */ + if (remap_oldmem_pfn_range(vma, from + len, + zeropage_pfn, + PAGE_SIZE, prot)) + goto fail; + len += PAGE_SIZE; + pos_start = pos + 1; + } + } + if (pos > pos_start) { + /* Remap the rest */ + map_size = (pos - pos_start) << PAGE_SHIFT; + if (remap_oldmem_pfn_range(vma, from + len, pos_start, + map_size, prot)) + goto fail; + } + return 0; +fail: + do_munmap(vma->vm_mm, from, len); + return -EAGAIN; +} + +int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, + unsigned long from, unsigned long pfn, + unsigned long size, pgprot_t prot) +{ + /* + * Check if oldmem_pfn_is_ram was registered to avoid + * looping over all pages without a reason. + */ + if (oldmem_pfn_is_ram) + return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + else + return remap_oldmem_pfn_range(vma, from, pfn, size, prot); +} + static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) { size_t size = vma->vm_end - vma->vm_start; @@ -387,9 +463,9 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) tsz = min_t(size_t, m->offset + m->size - start, size); paddr = m->paddr + start - m->offset; - if (remap_oldmem_pfn_range(vma, vma->vm_start + len, - paddr >> PAGE_SHIFT, tsz, - vma->vm_page_prot)) + if (vmcore_remap_oldmem_pfn(vma, vma->vm_start + len, + paddr >> PAGE_SHIFT, tsz, + vma->vm_page_prot)) goto fail; size -= tsz; start += tsz; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 34a1e5aa848c..9d7b9a83699e 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -394,7 +394,7 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size) prot = pgprot_noncached(PAGE_KERNEL); - pages = kmalloc(sizeof(struct page *) * page_count, GFP_KERNEL); + pages = kmalloc_array(page_count, sizeof(struct page *), GFP_KERNEL); if (!pages) { pr_err("%s: Failed to allocate array for %u pages\n", __func__, page_count); diff --git a/fs/qnx6/Makefile b/fs/qnx6/Makefile index 9dd06199afc9..5e6bae6fae50 100644 --- a/fs/qnx6/Makefile +++ b/fs/qnx6/Makefile @@ -5,3 +5,4 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6.o qnx6-objs := inode.o dir.o namei.o super_mmi.o +ccflags-$(CONFIG_QNX6FS_DEBUG) += -DDEBUG diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 15b7d92ed60d..8d64bb5366bf 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -77,21 +77,20 @@ static int qnx6_dir_longfilename(struct inode *inode, if (de->de_size != 0xff) { /* error - long filename entries always have size 0xff in direntry */ - printk(KERN_ERR "qnx6: invalid direntry size (%i).\n", - de->de_size); + pr_err("invalid direntry size (%i).\n", de->de_size); return 0; } lf = qnx6_longname(s, de, &page); if (IS_ERR(lf)) { - printk(KERN_ERR "qnx6:Error reading longname\n"); + pr_err("Error reading longname\n"); return 0; } lf_size = fs16_to_cpu(sbi, lf->lf_size); if (lf_size > QNX6_LONG_NAME_MAX) { - QNX6DEBUG((KERN_INFO "file %s\n", lf->lf_fname)); - printk(KERN_ERR "qnx6:Filename too long (%i)\n", lf_size); + pr_debug("file %s\n", lf->lf_fname); + pr_err("Filename too long (%i)\n", lf_size); qnx6_put_page(page); return 0; } @@ -100,10 +99,10 @@ static int qnx6_dir_longfilename(struct inode *inode, mmi 3g filesystem does not have that checksum */ if (!test_opt(s, MMI_FS) && fs32_to_cpu(sbi, de->de_checksum) != qnx6_lfile_checksum(lf->lf_fname, lf_size)) - printk(KERN_INFO "qnx6: long filename checksum error.\n"); + pr_info("long filename checksum error.\n"); - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s inode:%u\n", - lf_size, lf->lf_fname, de_inode)); + pr_debug("qnx6_readdir:%.*s inode:%u\n", + lf_size, lf->lf_fname, de_inode); if (!dir_emit(ctx, lf->lf_fname, lf_size, de_inode, DT_UNKNOWN)) { qnx6_put_page(page); return 0; @@ -136,7 +135,7 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) int i = start; if (IS_ERR(page)) { - printk(KERN_ERR "qnx6_readdir: read failed\n"); + pr_err("%s(): read failed\n", __func__); ctx->pos = (n + 1) << PAGE_CACHE_SHIFT; return PTR_ERR(page); } @@ -159,9 +158,9 @@ static int qnx6_readdir(struct file *file, struct dir_context *ctx) break; } } else { - QNX6DEBUG((KERN_INFO "qnx6_readdir:%.*s" - " inode:%u\n", size, de->de_fname, - no_inode)); + pr_debug("%s():%.*s inode:%u\n", + __func__, size, de->de_fname, + no_inode); if (!dir_emit(ctx, de->de_fname, size, no_inode, DT_UNKNOWN)) { done = true; @@ -259,8 +258,7 @@ unsigned qnx6_find_entry(int len, struct inode *dir, const char *name, if (ino) goto found; } else - printk(KERN_ERR "qnx6: undefined " - "filename size in inode.\n"); + pr_err("undefined filename size in inode.\n"); } qnx6_put_page(page); } diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 65cdaab3ed49..44e73923670d 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -73,8 +73,8 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, { unsigned phys; - QNX6DEBUG((KERN_INFO "qnx6: qnx6_get_block inode=[%ld] iblock=[%ld]\n", - inode->i_ino, (unsigned long)iblock)); + pr_debug("qnx6_get_block inode=[%ld] iblock=[%ld]\n", + inode->i_ino, (unsigned long)iblock); phys = qnx6_block_map(inode, iblock); if (phys) { @@ -87,7 +87,7 @@ static int qnx6_get_block(struct inode *inode, sector_t iblock, static int qnx6_check_blockptr(__fs32 ptr) { if (ptr == ~(__fs32)0) { - printk(KERN_ERR "qnx6: hit unused blockpointer.\n"); + pr_err("hit unused blockpointer.\n"); return 0; } return 1; @@ -127,8 +127,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) levelptr = no >> bitdelta; if (levelptr > QNX6_NO_DIRECT_POINTERS - 1) { - printk(KERN_ERR "qnx6:Requested file block number (%u) too big.", - no); + pr_err("Requested file block number (%u) too big.", no); return 0; } @@ -137,8 +136,7 @@ static unsigned qnx6_block_map(struct inode *inode, unsigned no) for (i = 0; i < depth; i++) { bh = sb_bread(s, block); if (!bh) { - printk(KERN_ERR "qnx6:Error reading block (%u)\n", - block); + pr_err("Error reading block (%u)\n", block); return 0; } bitdelta -= ptrbits; @@ -207,26 +205,16 @@ void qnx6_superblock_debug(struct qnx6_super_block *sb, struct super_block *s) { struct qnx6_sb_info *sbi = QNX6_SB(s); - QNX6DEBUG((KERN_INFO "magic: %08x\n", - fs32_to_cpu(sbi, sb->sb_magic))); - QNX6DEBUG((KERN_INFO "checksum: %08x\n", - fs32_to_cpu(sbi, sb->sb_checksum))); - QNX6DEBUG((KERN_INFO "serial: %llx\n", - fs64_to_cpu(sbi, sb->sb_serial))); - QNX6DEBUG((KERN_INFO "flags: %08x\n", - fs32_to_cpu(sbi, sb->sb_flags))); - QNX6DEBUG((KERN_INFO "blocksize: %08x\n", - fs32_to_cpu(sbi, sb->sb_blocksize))); - QNX6DEBUG((KERN_INFO "num_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_inodes))); - QNX6DEBUG((KERN_INFO "free_inodes: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_inodes))); - QNX6DEBUG((KERN_INFO "num_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_num_blocks))); - QNX6DEBUG((KERN_INFO "free_blocks: %08x\n", - fs32_to_cpu(sbi, sb->sb_free_blocks))); - QNX6DEBUG((KERN_INFO "inode_levels: %02x\n", - sb->Inode.levels)); + pr_debug("magic: %08x\n", fs32_to_cpu(sbi, sb->sb_magic)); + pr_debug("checksum: %08x\n", fs32_to_cpu(sbi, sb->sb_checksum)); + pr_debug("serial: %llx\n", fs64_to_cpu(sbi, sb->sb_serial)); + pr_debug("flags: %08x\n", fs32_to_cpu(sbi, sb->sb_flags)); + pr_debug("blocksize: %08x\n", fs32_to_cpu(sbi, sb->sb_blocksize)); + pr_debug("num_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_num_inodes)); + pr_debug("free_inodes: %08x\n", fs32_to_cpu(sbi, sb->sb_free_inodes)); + pr_debug("num_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_num_blocks)); + pr_debug("free_blocks: %08x\n", fs32_to_cpu(sbi, sb->sb_free_blocks)); + pr_debug("inode_levels: %02x\n", sb->Inode.levels); } #endif @@ -277,7 +265,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, start with the first superblock */ bh = sb_bread(s, offset); if (!bh) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); return NULL; } sb = (struct qnx6_super_block *)bh->b_data; @@ -285,20 +273,16 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, sbi->s_bytesex = BYTESEX_BE; if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { /* we got a big endian fs */ - QNX6DEBUG((KERN_INFO "qnx6: fs got different" - " endianness.\n")); + pr_debug("fs got different endianness.\n"); return bh; } else sbi->s_bytesex = BYTESEX_LE; if (!silent) { if (offset == 0) { - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); } else { - printk(KERN_INFO "qnx6: wrong signature (magic)" - " at position (0x%lx) - will try" - " alternative position (0x0000).\n", - offset * s->s_blocksize); + pr_info("wrong signature (magic) at position (0x%lx) - will try alternative position (0x0000).\n", + offset * s->s_blocksize); } } brelse(bh); @@ -329,13 +313,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* Superblock always is 512 Byte long */ if (!sb_set_blocksize(s, QNX6_SUPERBLOCK_SIZE)) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto outnobh; } /* parse the mount-options */ if (!qnx6_parse_options((char *) data, s)) { - printk(KERN_ERR "qnx6: invalid mount options.\n"); + pr_err("invalid mount options.\n"); goto outnobh; } if (test_opt(s, MMI_FS)) { @@ -355,7 +339,7 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* try again without bootblock offset */ bh1 = qnx6_check_first_superblock(s, 0, silent); if (!bh1) { - printk(KERN_ERR "qnx6: unable to read the first superblock\n"); + pr_err("unable to read the first superblock\n"); goto outnobh; } /* seems that no bootblock at partition start */ @@ -370,13 +354,13 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -398,21 +382,20 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) /* next the second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic)" - " in superblock #2.\n"); + pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #2 checksum error\n"); + pr_err("superblock #2 checksum error\n"); goto out; } @@ -422,25 +405,24 @@ static int qnx6_fill_super(struct super_block *s, void *data, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } mmi_success: /* sanity check - limit maximum indirect pointer levels */ if (sb1->Inode.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many inode levels (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); + pr_err("too many inode levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Inode.levels); goto out; } if (sb1->Longfile.levels > QNX6_PTR_MAX_LEVELS) { - printk(KERN_ERR "qnx6: too many longfilename levels" - " (max %i, sb %i)\n", - QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); + pr_err("too many longfilename levels (max %i, sb %i)\n", + QNX6_PTR_MAX_LEVELS, sb1->Longfile.levels); goto out; } s->s_op = &qnx6_sops; @@ -460,7 +442,7 @@ mmi_success: /* prefetch root inode */ root = qnx6_iget(s, QNX6_ROOT_INO); if (IS_ERR(root)) { - printk(KERN_ERR "qnx6: get inode failed\n"); + pr_err("get inode failed\n"); ret = PTR_ERR(root); goto out2; } @@ -474,7 +456,7 @@ mmi_success: errmsg = qnx6_checkroot(s); if (errmsg != NULL) { if (!silent) - printk(KERN_ERR "qnx6: %s\n", errmsg); + pr_err("%s\n", errmsg); goto out3; } return 0; @@ -555,8 +537,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mode = 0; if (ino == 0) { - printk(KERN_ERR "qnx6: bad inode number on dev %s: %u is " - "out of range\n", + pr_err("bad inode number on dev %s: %u is out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); @@ -566,8 +547,8 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) mapping = sbi->inodes->i_mapping; page = read_mapping_page(mapping, n, NULL); if (IS_ERR(page)) { - printk(KERN_ERR "qnx6: major problem: unable to read inode from " - "dev %s\n", sb->s_id); + pr_err("major problem: unable to read inode from dev %s\n", + sb->s_id); iget_failed(inode); return ERR_CAST(page); } @@ -689,7 +670,7 @@ static int __init init_qnx6_fs(void) return err; } - printk(KERN_INFO "QNX6 filesystem 1.0.0 registered.\n"); + pr_info("QNX6 filesystem 1.0.0 registered.\n"); return 0; } diff --git a/fs/qnx6/namei.c b/fs/qnx6/namei.c index 0561326a94f5..6c1a323137dd 100644 --- a/fs/qnx6/namei.c +++ b/fs/qnx6/namei.c @@ -29,12 +29,12 @@ struct dentry *qnx6_lookup(struct inode *dir, struct dentry *dentry, foundinode = qnx6_iget(dir->i_sb, ino); qnx6_put_page(page); if (IS_ERR(foundinode)) { - QNX6DEBUG((KERN_ERR "qnx6: lookup->iget -> " - " error %ld\n", PTR_ERR(foundinode))); + pr_debug("lookup->iget -> error %ld\n", + PTR_ERR(foundinode)); return ERR_CAST(foundinode); } } else { - QNX6DEBUG((KERN_INFO "qnx6_lookup: not found %s\n", name)); + pr_debug("%s(): not found %s\n", __func__, name); return NULL; } d_add(dentry, foundinode); diff --git a/fs/qnx6/qnx6.h b/fs/qnx6/qnx6.h index b00fcc960d37..d3fb2b698800 100644 --- a/fs/qnx6/qnx6.h +++ b/fs/qnx6/qnx6.h @@ -10,6 +10,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/pagemap.h> @@ -19,12 +25,6 @@ typedef __u64 __bitwise __fs64; #include <linux/qnx6_fs.h> -#ifdef CONFIG_QNX6FS_DEBUG -#define QNX6DEBUG(X) printk X -#else -#define QNX6DEBUG(X) (void) 0 -#endif - struct qnx6_sb_info { struct buffer_head *sb_buf; /* superblock buffer */ struct qnx6_super_block *sb; /* our superblock */ diff --git a/fs/qnx6/super_mmi.c b/fs/qnx6/super_mmi.c index 29c32cba62d6..62aaf3e3126a 100644 --- a/fs/qnx6/super_mmi.c +++ b/fs/qnx6/super_mmi.c @@ -44,15 +44,14 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) start with the first superblock */ bh1 = sb_bread(s, 0); if (!bh1) { - printk(KERN_ERR "qnx6: Unable to read first mmi superblock\n"); + pr_err("Unable to read first mmi superblock\n"); return NULL; } sb1 = (struct qnx6_mmi_super_block *)bh1->b_data; sbi = QNX6_SB(s); if (fs32_to_cpu(sbi, sb1->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) { - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #1.\n"); + pr_err("wrong signature (magic) in superblock #1.\n"); goto out; } } @@ -60,7 +59,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb1->sb_checksum) != crc32_be(0, (char *)(bh1->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } @@ -70,7 +69,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* set new blocksize */ if (!sb_set_blocksize(s, fs32_to_cpu(sbi, sb1->sb_blocksize))) { - printk(KERN_ERR "qnx6: unable to set blocksize\n"); + pr_err("unable to set blocksize\n"); goto out; } /* blocksize invalidates bh - pull it back in */ @@ -83,27 +82,26 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) /* read second superblock */ bh2 = sb_bread(s, offset); if (!bh2) { - printk(KERN_ERR "qnx6: unable to read the second superblock\n"); + pr_err("unable to read the second superblock\n"); goto out; } sb2 = (struct qnx6_mmi_super_block *)bh2->b_data; if (fs32_to_cpu(sbi, sb2->sb_magic) != QNX6_SUPER_MAGIC) { if (!silent) - printk(KERN_ERR "qnx6: wrong signature (magic) in" - " superblock #2.\n"); + pr_err("wrong signature (magic) in superblock #2.\n"); goto out; } /* checksum check - start at byte 8 and end at byte 512 */ if (fs32_to_cpu(sbi, sb2->sb_checksum) != crc32_be(0, (char *)(bh2->b_data + 8), 504)) { - printk(KERN_ERR "qnx6: superblock #1 checksum error\n"); + pr_err("superblock #1 checksum error\n"); goto out; } qsb = kmalloc(sizeof(*qsb), GFP_KERNEL); if (!qsb) { - printk(KERN_ERR "qnx6: unable to allocate memory.\n"); + pr_err("unable to allocate memory.\n"); goto out; } @@ -119,7 +117,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh1; sbi->sb = (struct qnx6_super_block *)bh1->b_data; brelse(bh2); - printk(KERN_INFO "qnx6: superblock #1 active\n"); + pr_info("superblock #1 active\n"); } else { /* superblock #2 active */ qnx6_mmi_copy_sb(qsb, sb2); @@ -131,7 +129,7 @@ struct qnx6_super_block *qnx6_mmi_fill_super(struct super_block *s, int silent) sbi->sb_buf = bh2; sbi->sb = (struct qnx6_super_block *)bh2->b_data; brelse(bh1); - printk(KERN_INFO "qnx6: superblock #2 active\n"); + pr_info("superblock #2 active\n"); } kfree(qsb); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index dda012ad4208..bbafbde3471a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -222,7 +222,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, /* gang-find the pages */ ret = -ENOMEM; - pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(lpages, sizeof(struct page *), GFP_KERNEL); if (!pages) goto out_free; diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index d9f5a60dd59b..0a7dc941aaf4 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -9,7 +9,7 @@ #include <linux/stat.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> extern const struct reiserfs_key MIN_KEY; diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 54fdf196bfb2..5739cb99de7b 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -10,7 +10,7 @@ * and using buffers obtained after all above. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/time.h> #include "reiserfs.h" #include <linux/buffer_head.h> diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index db9e80ba53a0..751dd3f4346b 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -6,7 +6,7 @@ #include "reiserfs.h" #include "acl.h" #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/reiserfs/ibalance.c b/fs/reiserfs/ibalance.c index 73231b1ebdbe..b751eea32e20 100644 --- a/fs/reiserfs/ibalance.c +++ b/fs/reiserfs/ibalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 63b2b0ec49e6..a7eec9888f10 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -11,7 +11,7 @@ #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unaligned.h> #include <linux/buffer_head.h> #include <linux/mpage.h> diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 501ed6811a2b..6ec8a30a0911 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -7,7 +7,7 @@ #include <linux/mount.h> #include "reiserfs.h" #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/pagemap.h> #include <linux/compat.h> diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index cfaee912ee09..aca73dd73906 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -54,7 +54,7 @@ static void sd_print_item(struct item_head *ih, char *item) } else { struct stat_data *sd = (struct stat_data *)item; - printk("\t0%-6o | %6Lu | %2u | %d | %s\n", sd_v2_mode(sd), + printk("\t0%-6o | %6llu | %2u | %d | %s\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), sd_v2_rdev(sd), print_time(sd_v2_mtime(sd))); } @@ -408,7 +408,7 @@ static void direntry_print_item(struct item_head *ih, char *item) namebuf[namelen + 2] = 0; } - printk("%d: %-15s%-15d%-15d%-15Ld%-15Ld(%s)\n", + printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", i, namebuf, deh_dir_id(deh), deh_objectid(deh), GET_HASH_VALUE(deh_offset(deh)), diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index d6744c8b24e1..814dda3ec998 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -2,7 +2,7 @@ * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index c9b47e91baf8..ae1dc841db3a 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -17,7 +17,7 @@ static char off_buf[80]; static char *reiserfs_cpu_offset(struct cpu_key *key) { if (cpu_key_k_type(key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(cpu_key_k_offset(key)), (unsigned long long) @@ -34,7 +34,7 @@ static char *le_offset(struct reiserfs_key *key) version = le_key_version(key); if (le_key_k_type(version, key) == TYPE_DIRENTRY) - sprintf(off_buf, "%Lu(%Lu)", + sprintf(off_buf, "%llu(%llu)", (unsigned long long) GET_HASH_VALUE(le_key_k_offset(version, key)), (unsigned long long) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 02b0b7d0f7d5..621b9f381fe1 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -11,7 +11,7 @@ #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index dd44468edc2b..24cbe013240f 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -2006,7 +2006,7 @@ int reiserfs_do_truncate(struct reiserfs_transaction_handle *th, &s_search_path) == POSITION_FOUND); RFALSE(file_size > ROUND_UP(new_file_size), - "PAP-5680: truncate did not finish: new_file_size %Ld, current %Ld, oid %d", + "PAP-5680: truncate did not finish: new_file_size %lld, current %lld, oid %d", new_file_size, file_size, s_item_key.on_disk_key.k_objectid); update_and_out: diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a392cef6acc6..709ea92d716f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -15,7 +15,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/time.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "reiserfs.h" #include "acl.h" #include "xattr.h" @@ -331,7 +331,7 @@ static int finish_unfinished(struct super_block *s) * not completed truncate found. New size was * committed together with "save" link */ - reiserfs_info(s, "Truncating %k to %Ld ..", + reiserfs_info(s, "Truncating %k to %lld ..", INODE_PKEY(inode), inode->i_size); /* don't update modification time */ @@ -1577,7 +1577,7 @@ static int read_super_block(struct super_block *s, int offset) rs = (struct reiserfs_super_block *)bh->b_data; if (sb_blocksize(rs) != s->s_blocksize) { reiserfs_warning(s, "sh-2011", "can't find a reiserfs " - "filesystem on (dev %s, block %Lu, size %lu)", + "filesystem on (dev %s, block %llu, size %lu)", s->s_id, (unsigned long long)bh->b_blocknr, s->s_blocksize); @@ -2441,8 +2441,7 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh, *bh; if (!current->journal_info) { - printk(KERN_WARNING "reiserfs: Quota write (off=%Lu, len=%Lu)" - " cancelled because transaction is not started.\n", + printk(KERN_WARNING "reiserfs: Quota write (off=%llu, len=%llu) cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); return -EIO; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index ca416d099e7d..7c36898af402 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -45,7 +45,7 @@ #include <linux/xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/checksum.h> #include <linux/stat.h> #include <linux/quotaops.h> @@ -84,6 +84,7 @@ static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -98,6 +99,7 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; + BUG_ON(!mutex_is_locked(&dir->i_mutex)); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); @@ -117,6 +119,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) { struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; + if (!privroot->d_inode) return ERR_PTR(-ENODATA); @@ -127,6 +130,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) xaroot = ERR_PTR(-ENODATA); else if (!xaroot->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(privroot->d_inode, xaroot, 0700); if (err) { @@ -157,6 +161,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && !xadir->d_inode) { int err = -ENODATA; + if (xattr_may_create(flags)) err = xattr_mkdir(xaroot->d_inode, xadir, 0700); if (err) { @@ -188,6 +193,7 @@ fill_with_dentries(void *buf, const char *name, int namelen, loff_t offset, { struct reiserfs_dentry_buf *dbuf = buf; struct dentry *dentry; + WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) @@ -218,6 +224,7 @@ static void cleanup_dentry_buf(struct reiserfs_dentry_buf *buf) { int i; + for (i = 0; i < buf->count; i++) if (buf->dentries[i]) dput(buf->dentries[i]); @@ -283,11 +290,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, int blocks = JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + reiserfs_write_lock(inode->i_sb); err = journal_begin(&th, inode->i_sb, blocks); reiserfs_write_unlock(inode->i_sb); if (!err) { int jerror; + mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, I_MUTEX_XATTR); err = action(dir, data); @@ -340,6 +349,7 @@ static int chown_one_xattr(struct dentry *dentry, void *data) int reiserfs_delete_xattrs(struct inode *inode) { int err = reiserfs_for_each_xattr(inode, delete_one_xattr, NULL); + if (err) reiserfs_warning(inode->i_sb, "jdm-20004", "Couldn't delete all xattrs (%d)\n", err); @@ -350,6 +360,7 @@ int reiserfs_delete_xattrs(struct inode *inode) int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs) { int err = reiserfs_for_each_xattr(inode, chown_one_xattr, attrs); + if (err) reiserfs_warning(inode->i_sb, "jdm-20007", "Couldn't chown all xattrs (%d)\n", err); @@ -439,6 +450,7 @@ int reiserfs_commit_write(struct file *f, struct page *page, static void update_ctime(struct inode *inode) { struct timespec now = current_fs_time(inode->i_sb); + if (inode_unhashed(inode) || !inode->i_nlink || timespec_equal(&inode->i_ctime, &now)) return; @@ -514,6 +526,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, size_t chunk; size_t skip = 0; size_t page_offset = (file_pos & (PAGE_CACHE_SIZE - 1)); + if (buffer_size - buffer_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -530,6 +543,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (file_pos == 0) { struct reiserfs_xattr_header *rxh; + skip = file_pos = sizeof(struct reiserfs_xattr_header); if (chunk + skip > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE - skip; @@ -659,6 +673,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, size_t chunk; char *data; size_t skip = 0; + if (isize - file_pos > PAGE_CACHE_SIZE) chunk = PAGE_CACHE_SIZE; else @@ -792,6 +807,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, int reiserfs_removexattr(struct dentry *dentry, const char *name) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) @@ -813,9 +829,11 @@ static int listxattr_filler(void *buf, const char *name, int namelen, { struct listxattr_buf *b = (struct listxattr_buf *)buf; size_t size; + if (name[0] != '.' || (namelen != 1 && (name[1] != '.' || namelen != 2))) { const struct xattr_handler *handler; + handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); if (!handler) /* Unsupported xattr name */ @@ -885,6 +903,7 @@ static int create_privroot(struct dentry *dentry) { int err; struct inode *inode = dentry->d_parent->d_inode; + WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); @@ -1015,6 +1034,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) mutex_lock(&privroot->d_inode->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; + dentry = lookup_one_len(XAROOT_NAME, privroot, strlen(XAROOT_NAME)); if (!IS_ERR(dentry)) diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 44503e293790..4b34b9dc03dd 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -9,7 +9,7 @@ #include <linux/posix_acl_xattr.h> #include "xattr.h" #include "acl.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, int type, diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index 800a3cef6f62..e7f8939a4cb5 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -6,7 +6,7 @@ #include <linux/slab.h> #include "xattr.h" #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a0035719f66b..5eeb0c48ba46 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -5,7 +5,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 8667491ae7c3..e50eab046471 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -4,7 +4,7 @@ #include <linux/pagemap.h> #include <linux/xattr.h> #include "xattr.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> static int user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, diff --git a/fs/romfs/super.c b/fs/romfs/super.c index ef90e8bca95a..e98dd88197d5 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -56,6 +56,8 @@ * 2 of the Licence, or (at your option) any later version. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> @@ -380,7 +382,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) eio: ret = -EIO; error: - printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos); + pr_err("read error for inode 0x%lx\n", pos); return ERR_PTR(ret); } @@ -390,6 +392,7 @@ error: static struct inode *romfs_alloc_inode(struct super_block *sb) { struct romfs_inode_info *inode; + inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL); return inode ? &inode->vfs_inode : NULL; } @@ -400,6 +403,7 @@ static struct inode *romfs_alloc_inode(struct super_block *sb) static void romfs_i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); + kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode)); } @@ -507,15 +511,13 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 || img_size < ROMFH_SIZE) { if (!silent) - printk(KERN_WARNING "VFS:" - " Can't find a romfs filesystem on dev %s.\n", + pr_warn("VFS: Can't find a romfs filesystem on dev %s.\n", sb->s_id); goto error_rsb_inval; } if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) { - printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n", - sb->s_id); + pr_err("bad initial checksum on dev %s.\n", sb->s_id); goto error_rsb_inval; } @@ -523,8 +525,8 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) len = strnlen(rsb->name, ROMFS_MAXFN); if (!silent) - printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n", - (unsigned) len, (unsigned) len, rsb->name, storage); + pr_notice("Mounting image '%*.*s' through %s\n", + (unsigned) len, (unsigned) len, rsb->name, storage); kfree(rsb); rsb = NULL; @@ -614,7 +616,7 @@ static int __init init_romfs_fs(void) { int ret; - printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n"); + pr_info("ROMFS MTD (C) 2007 Red Hat, Inc.\n"); romfs_inode_cachep = kmem_cache_create("romfs_i", @@ -623,13 +625,12 @@ static int __init init_romfs_fs(void) romfs_i_init_once); if (!romfs_inode_cachep) { - printk(KERN_ERR - "ROMFS error: Failed to initialise inode cache\n"); + pr_err("Failed to initialise inode cache\n"); return -ENOMEM; } ret = register_filesystem(&romfs_fs_type); if (ret) { - printk(KERN_ERR "ROMFS error: Failed to register filesystem\n"); + pr_err("Failed to register filesystem\n"); goto error_register; } return 0; diff --git a/fs/seq_file.c b/fs/seq_file.c index 3857b720cb1b..1d641bb108d2 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -8,10 +8,8 @@ #include <linux/fs.h> #include <linux/export.h> #include <linux/seq_file.h> -#include <linux/vmalloc.h> #include <linux/slab.h> #include <linux/cred.h> -#include <linux/mm.h> #include <asm/uaccess.h> #include <asm/page.h> @@ -32,16 +30,6 @@ static void seq_set_overflow(struct seq_file *m) m->count = m->size; } -static void *seq_buf_alloc(unsigned long size) -{ - void *buf; - - buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; -} - /** * seq_open - initialize sequential file * @file: file we initialize @@ -108,7 +96,7 @@ static int traverse(struct seq_file *m, loff_t offset) return 0; } if (!m->buf) { - m->buf = seq_buf_alloc(m->size = PAGE_SIZE); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); if (!m->buf) return -ENOMEM; } @@ -147,9 +135,9 @@ static int traverse(struct seq_file *m, loff_t offset) Eoverflow: m->op->stop(m, p); - kvfree(m->buf); + kfree(m->buf); m->count = 0; - m->buf = seq_buf_alloc(m->size <<= 1); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); return !m->buf ? -ENOMEM : -EAGAIN; } @@ -204,7 +192,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) /* grab buffer if we didn't have one */ if (!m->buf) { - m->buf = seq_buf_alloc(m->size = PAGE_SIZE); + m->buf = kmalloc(m->size = PAGE_SIZE, GFP_KERNEL); if (!m->buf) goto Enomem; } @@ -244,9 +232,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (m->count < m->size) goto Fill; m->op->stop(m, p); - kvfree(m->buf); + kfree(m->buf); m->count = 0; - m->buf = seq_buf_alloc(m->size <<= 1); + m->buf = kmalloc(m->size <<= 1, GFP_KERNEL); if (!m->buf) goto Enomem; m->version = 0; @@ -362,7 +350,7 @@ EXPORT_SYMBOL(seq_lseek); int seq_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; - kvfree(m->buf); + kfree(m->buf); kfree(m); return 0; } @@ -617,13 +605,13 @@ EXPORT_SYMBOL(single_open); int single_open_size(struct file *file, int (*show)(struct seq_file *, void *), void *data, size_t size) { - char *buf = seq_buf_alloc(size); + char *buf = kmalloc(size, GFP_KERNEL); int ret; if (!buf) return -ENOMEM; ret = single_open(file, show, data); if (ret) { - kvfree(buf); + kfree(buf); return ret; } ((struct seq_file *)file->private_data)->buf = buf; diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c index 62a0de6632e1..43e7a7eddac0 100644 --- a/fs/squashfs/file_direct.c +++ b/fs/squashfs/file_direct.c @@ -44,7 +44,7 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize) pages = end_index - start_index + 1; - page = kmalloc(sizeof(void *) * pages, GFP_KERNEL); + page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL); if (page == NULL) return res; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 031c8d67fd51..5056babe00df 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -27,6 +27,8 @@ * the filesystem. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/fs.h> #include <linux/vfs.h> #include <linux/slab.h> @@ -448,8 +450,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " - "Phillip Lougher\n"); + pr_info("version 4.0 (2009/01/31) Phillip Lougher\n"); return 0; } diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 2290d5866725..fb08b0c514b6 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -431,7 +431,7 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) /** * wbuf_timer_callback - write-buffer timer callback function. - * @data: timer data (write-buffer descriptor) + * @timer: timer data (write-buffer descriptor) * * This function is called when the write-buffer timer expires. */ diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index c14adb2f420c..e2562961dd61 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -596,7 +596,6 @@ static void drop_last_group(struct ubifs_scan_leb *sleb, int *offs) * drop_last_node - drop the last node. * @sleb: scanned LEB information * @offs: offset of dropped nodes is returned here - * @grouped: non-zero if whole group of nodes have to be dropped * * This is a helper function for 'ubifs_recover_leb()' which drops the last * node of the scanned LEB. diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 3904c8574ef9..e5bd068cdb6c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -424,19 +424,19 @@ static int ubifs_show_options(struct seq_file *s, struct dentry *root) struct ubifs_info *c = root->d_sb->s_fs_info; if (c->mount_opts.unmount_mode == 2) - seq_printf(s, ",fast_unmount"); + seq_puts(s, ",fast_unmount"); else if (c->mount_opts.unmount_mode == 1) - seq_printf(s, ",norm_unmount"); + seq_puts(s, ",norm_unmount"); if (c->mount_opts.bulk_read == 2) - seq_printf(s, ",bulk_read"); + seq_puts(s, ",bulk_read"); else if (c->mount_opts.bulk_read == 1) - seq_printf(s, ",no_bulk_read"); + seq_puts(s, ",no_bulk_read"); if (c->mount_opts.chk_data_crc == 2) - seq_printf(s, ",chk_data_crc"); + seq_puts(s, ",chk_data_crc"); else if (c->mount_opts.chk_data_crc == 1) - seq_printf(s, ",no_chk_data_crc"); + seq_puts(s, ",no_chk_data_crc"); if (c->mount_opts.override_compr) { seq_printf(s, ",compr=%s", diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index dd39980437fc..4d0e02b022b3 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -6,3 +6,4 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ namei.o super.o symlink.o truncate.o util.o +ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 61e8a9b021dd..7c580c97990e 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -158,16 +158,16 @@ out: /** * ufs_inode_getfrag() - allocate new fragment(s) - * @inode - pointer to inode - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @fragment: number of `fragment' which hold pointer * to new allocated fragment(s) - * @new_fragment - number of new allocated fragment(s) - * @required - how many fragment(s) we require - * @err - we set it if something wrong - * @phys - pointer to where we save physical number of new allocated fragments, + * @new_fragment: number of new allocated fragment(s) + * @required: how many fragment(s) we require + * @err: we set it if something wrong + * @phys: pointer to where we save physical number of new allocated fragments, * NULL if we allocate not data(indirect blocks for example). - * @new - we set it if we allocate new block - * @locked_page - for ufs_new_fragments() + * @new: we set it if we allocate new block + * @locked_page: for ufs_new_fragments() */ static struct buffer_head * ufs_inode_getfrag(struct inode *inode, u64 fragment, @@ -315,16 +315,16 @@ repeat2: /** * ufs_inode_getblock() - allocate new block - * @inode - pointer to inode - * @bh - pointer to block which hold "pointer" to new allocated block - * @fragment - number of `fragment' which hold pointer + * @inode: pointer to inode + * @bh: pointer to block which hold "pointer" to new allocated block + * @fragment: number of `fragment' which hold pointer * to new allocated block - * @new_fragment - number of new allocated fragment + * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) - * @err - see ufs_inode_getfrag() - * @phys - see ufs_inode_getfrag() - * @new - see ufs_inode_getfrag() - * @locked_page - see ufs_inode_getfrag() + * @err: see ufs_inode_getfrag() + * @phys: see ufs_inode_getfrag() + * @new: see ufs_inode_getfrag() + * @locked_page: see ufs_inode_getfrag() */ static struct buffer_head * ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index b879f1ba3439..da73801301d5 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -65,7 +65,6 @@ * Evgeniy Dushistov <dushistov@mail.ru>, 2007 */ - #include <linux/exportfs.h> #include <linux/module.h> #include <linux/bitops.h> @@ -172,73 +171,73 @@ static void ufs_print_super_stuff(struct super_block *sb, { u32 magic = fs32_to_cpu(sb, usb3->fs_magic); - printk("ufs_print_super_stuff\n"); - printk(" magic: 0x%x\n", magic); + pr_debug("ufs_print_super_stuff\n"); + pr_debug(" magic: 0x%x\n", magic); if (fs32_to_cpu(sb, usb3->fs_magic) == UFS2_MAGIC) { - printk(" fs_size: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); - printk(" fs_dsize: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); - printk(" bsize: %u\n", - fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", - fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); - printk(" fs_sblockloc: %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); - printk(" cs_ndir(No of dirs): %llu\n", (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); - printk(" cs_nbfree(No of free blocks): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); - printk(KERN_INFO" cs_nifree(Num of free inodes): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); - printk(KERN_INFO" cs_nffree(Num of free frags): %llu\n", - (unsigned long long) - fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); - printk(KERN_INFO" fs_maxsymlinklen: %u\n", - fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); + pr_debug(" fs_size: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_size)); + pr_debug(" fs_dsize: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.fs_dsize)); + pr_debug(" bsize: %u\n", + fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", + fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" fs_volname: %s\n", usb2->fs_un.fs_u2.fs_volname); + pr_debug(" fs_sblockloc: %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.fs_sblockloc)); + pr_debug(" cs_ndir(No of dirs): %llu\n", (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_ndir)); + pr_debug(" cs_nbfree(No of free blocks): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb2->fs_un.fs_u2.cs_nbfree)); + pr_info(" cs_nifree(Num of free inodes): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nifree)); + pr_info(" cs_nffree(Num of free frags): %llu\n", + (unsigned long long) + fs64_to_cpu(sb, usb3->fs_un1.fs_u2.cs_nffree)); + pr_info(" fs_maxsymlinklen: %u\n", + fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen)); } else { - printk(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); - printk(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); - printk(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); - printk(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); - printk(" cgoffset: %u\n", - fs32_to_cpu(sb, usb1->fs_cgoffset)); - printk(" ~cgmask: 0x%x\n", - ~fs32_to_cpu(sb, usb1->fs_cgmask)); - printk(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); - printk(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); - printk(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); - printk(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); - printk(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); - printk(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); - printk(" fragshift: %u\n", - fs32_to_cpu(sb, usb1->fs_fragshift)); - printk(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); - printk(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); - printk(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); - printk(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); - printk(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); - printk(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); - printk(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); - printk(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); - printk(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); - printk(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); - printk(" fstodb: %u\n", - fs32_to_cpu(sb, usb1->fs_fsbtodb)); - printk(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); - printk(" ndir %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); - printk(" nifree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); - printk(" nbfree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); - printk(" nffree %u\n", - fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); + pr_debug(" sblkno: %u\n", fs32_to_cpu(sb, usb1->fs_sblkno)); + pr_debug(" cblkno: %u\n", fs32_to_cpu(sb, usb1->fs_cblkno)); + pr_debug(" iblkno: %u\n", fs32_to_cpu(sb, usb1->fs_iblkno)); + pr_debug(" dblkno: %u\n", fs32_to_cpu(sb, usb1->fs_dblkno)); + pr_debug(" cgoffset: %u\n", + fs32_to_cpu(sb, usb1->fs_cgoffset)); + pr_debug(" ~cgmask: 0x%x\n", + ~fs32_to_cpu(sb, usb1->fs_cgmask)); + pr_debug(" size: %u\n", fs32_to_cpu(sb, usb1->fs_size)); + pr_debug(" dsize: %u\n", fs32_to_cpu(sb, usb1->fs_dsize)); + pr_debug(" ncg: %u\n", fs32_to_cpu(sb, usb1->fs_ncg)); + pr_debug(" bsize: %u\n", fs32_to_cpu(sb, usb1->fs_bsize)); + pr_debug(" fsize: %u\n", fs32_to_cpu(sb, usb1->fs_fsize)); + pr_debug(" frag: %u\n", fs32_to_cpu(sb, usb1->fs_frag)); + pr_debug(" fragshift: %u\n", + fs32_to_cpu(sb, usb1->fs_fragshift)); + pr_debug(" ~fmask: %u\n", ~fs32_to_cpu(sb, usb1->fs_fmask)); + pr_debug(" fshift: %u\n", fs32_to_cpu(sb, usb1->fs_fshift)); + pr_debug(" sbsize: %u\n", fs32_to_cpu(sb, usb1->fs_sbsize)); + pr_debug(" spc: %u\n", fs32_to_cpu(sb, usb1->fs_spc)); + pr_debug(" cpg: %u\n", fs32_to_cpu(sb, usb1->fs_cpg)); + pr_debug(" ipg: %u\n", fs32_to_cpu(sb, usb1->fs_ipg)); + pr_debug(" fpg: %u\n", fs32_to_cpu(sb, usb1->fs_fpg)); + pr_debug(" csaddr: %u\n", fs32_to_cpu(sb, usb1->fs_csaddr)); + pr_debug(" cssize: %u\n", fs32_to_cpu(sb, usb1->fs_cssize)); + pr_debug(" cgsize: %u\n", fs32_to_cpu(sb, usb1->fs_cgsize)); + pr_debug(" fstodb: %u\n", + fs32_to_cpu(sb, usb1->fs_fsbtodb)); + pr_debug(" nrpos: %u\n", fs32_to_cpu(sb, usb3->fs_nrpos)); + pr_debug(" ndir %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_ndir)); + pr_debug(" nifree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nifree)); + pr_debug(" nbfree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nbfree)); + pr_debug(" nffree %u\n", + fs32_to_cpu(sb, usb1->fs_cstotal.cs_nffree)); } - printk("\n"); + pr_debug("\n"); } /* @@ -247,38 +246,38 @@ static void ufs_print_super_stuff(struct super_block *sb, static void ufs_print_cylinder_stuff(struct super_block *sb, struct ufs_cylinder_group *cg) { - printk("\nufs_print_cylinder_stuff\n"); - printk("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); - printk(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); - printk(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); - printk(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); - printk(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); - printk(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); - printk(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); - printk(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); - printk(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); - printk(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); - printk(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); - printk(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); - printk(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); - printk(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); - printk(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", + pr_debug("\nufs_print_cylinder_stuff\n"); + pr_debug("size of ucg: %zu\n", sizeof(struct ufs_cylinder_group)); + pr_debug(" magic: %x\n", fs32_to_cpu(sb, cg->cg_magic)); + pr_debug(" time: %u\n", fs32_to_cpu(sb, cg->cg_time)); + pr_debug(" cgx: %u\n", fs32_to_cpu(sb, cg->cg_cgx)); + pr_debug(" ncyl: %u\n", fs16_to_cpu(sb, cg->cg_ncyl)); + pr_debug(" niblk: %u\n", fs16_to_cpu(sb, cg->cg_niblk)); + pr_debug(" ndblk: %u\n", fs32_to_cpu(sb, cg->cg_ndblk)); + pr_debug(" cs_ndir: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_ndir)); + pr_debug(" cs_nbfree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nbfree)); + pr_debug(" cs_nifree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nifree)); + pr_debug(" cs_nffree: %u\n", fs32_to_cpu(sb, cg->cg_cs.cs_nffree)); + pr_debug(" rotor: %u\n", fs32_to_cpu(sb, cg->cg_rotor)); + pr_debug(" frotor: %u\n", fs32_to_cpu(sb, cg->cg_frotor)); + pr_debug(" irotor: %u\n", fs32_to_cpu(sb, cg->cg_irotor)); + pr_debug(" frsum: %u, %u, %u, %u, %u, %u, %u, %u\n", fs32_to_cpu(sb, cg->cg_frsum[0]), fs32_to_cpu(sb, cg->cg_frsum[1]), fs32_to_cpu(sb, cg->cg_frsum[2]), fs32_to_cpu(sb, cg->cg_frsum[3]), fs32_to_cpu(sb, cg->cg_frsum[4]), fs32_to_cpu(sb, cg->cg_frsum[5]), fs32_to_cpu(sb, cg->cg_frsum[6]), fs32_to_cpu(sb, cg->cg_frsum[7])); - printk(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); - printk(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); - printk(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); - printk(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); - printk(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); - printk(" clustersumoff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); - printk(" clusteroff %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); - printk(" nclusterblks %u\n", - fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); - printk("\n"); + pr_debug(" btotoff: %u\n", fs32_to_cpu(sb, cg->cg_btotoff)); + pr_debug(" boff: %u\n", fs32_to_cpu(sb, cg->cg_boff)); + pr_debug(" iuseoff: %u\n", fs32_to_cpu(sb, cg->cg_iusedoff)); + pr_debug(" freeoff: %u\n", fs32_to_cpu(sb, cg->cg_freeoff)); + pr_debug(" nextfreeoff: %u\n", fs32_to_cpu(sb, cg->cg_nextfreeoff)); + pr_debug(" clustersumoff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clustersumoff)); + pr_debug(" clusteroff %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_clusteroff)); + pr_debug(" nclusterblks %u\n", + fs32_to_cpu(sb, cg->cg_u.cg_44.cg_nclusterblks)); + pr_debug("\n"); } #else # define ufs_print_super_stuff(sb, usb1, usb2, usb3) /**/ @@ -287,13 +286,12 @@ static void ufs_print_cylinder_stuff(struct super_block *sb, static const struct super_operations ufs_super_ops; -static char error_buf[1024]; - void ufs_error (struct super_block * sb, const char * function, const char * fmt, ...) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -305,20 +303,21 @@ void ufs_error (struct super_block * sb, const char * function, ufs_mark_sb_dirty(sb); sb->s_flags |= MS_RDONLY; } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { case UFS_MOUNT_ONERROR_PANIC: - panic ("UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + panic("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); case UFS_MOUNT_ONERROR_LOCK: case UFS_MOUNT_ONERROR_UMOUNT: case UFS_MOUNT_ONERROR_REPAIR: - printk (KERN_CRIT "UFS-fs error (device %s): %s: %s\n", - sb->s_id, function, error_buf); - } + pr_crit("error (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + } + va_end(args); } void ufs_panic (struct super_block * sb, const char * function, @@ -326,6 +325,7 @@ void ufs_panic (struct super_block * sb, const char * function, { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; + struct va_format vaf; va_list args; uspi = UFS_SB(sb)->s_uspi; @@ -336,24 +336,27 @@ void ufs_panic (struct super_block * sb, const char * function, ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); } - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; sb->s_flags |= MS_RDONLY; - printk (KERN_CRIT "UFS-fs panic (device %s): %s: %s\n", - sb->s_id, function, error_buf); + pr_crit("panic (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } void ufs_warning (struct super_block * sb, const char * function, const char * fmt, ...) { + struct va_format vaf; va_list args; - va_start (args, fmt); - vsnprintf (error_buf, sizeof(error_buf), fmt, args); - va_end (args); - printk (KERN_WARNING "UFS-fs warning (device %s): %s: %s\n", - sb->s_id, function, error_buf); + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + pr_warn("(device %s): %s: %pV\n", + sb->s_id, function, &vaf); + va_end(args); } enum { @@ -464,14 +467,12 @@ static int ufs_parse_options (char * options, unsigned * mount_options) ufs_set_opt (*mount_options, ONERROR_UMOUNT); break; case Opt_onerror_repair: - printk("UFS-fs: Unable to do repair on error, " - "will lock lock instead\n"); + pr_err("Unable to do repair on error, will lock lock instead\n"); ufs_clear_opt (*mount_options, ONERROR); ufs_set_opt (*mount_options, ONERROR_REPAIR); break; default: - printk("UFS-fs: Invalid option: \"%s\" " - "or missing value\n", p); + pr_err("Invalid option: \"%s\" or missing value\n", p); return 0; } } @@ -788,8 +789,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) #ifndef CONFIG_UFS_FS_WRITE if (!(sb->s_flags & MS_RDONLY)) { - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); return -EROFS; } #endif @@ -812,12 +812,12 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { - printk("wrong mount options\n"); + pr_err("wrong mount options\n"); goto failed; } if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { if (!silent) - printk("You didn't specify the type of your ufs filesystem\n\n" + pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " @@ -868,7 +868,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) break; case UFS_MOUNT_UFSTYPE_SUNOS: - UFSD(("ufstype=sunos\n")) + UFSD("ufstype=sunos\n"); uspi->s_fsize = block_size = 1024; uspi->s_fmask = ~(1024 - 1); uspi->s_fshift = 10; @@ -900,7 +900,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=old is supported read-only\n"); + pr_info("ufstype=old is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -916,7 +916,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep is supported read-only\n"); + pr_info("ufstype=nextstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -932,7 +932,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=nextstep-cd is supported read-only\n"); + pr_info("ufstype=nextstep-cd is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -948,7 +948,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_44BSD | UFS_UID_44BSD | UFS_ST_44BSD | UFS_CG_44BSD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=openstep is supported read-only\n"); + pr_info("ufstype=openstep is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; @@ -963,19 +963,19 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) flags |= UFS_DE_OLD | UFS_UID_OLD | UFS_ST_OLD | UFS_CG_OLD; if (!(sb->s_flags & MS_RDONLY)) { if (!silent) - printk(KERN_INFO "ufstype=hp is supported read-only\n"); + pr_info("ufstype=hp is supported read-only\n"); sb->s_flags |= MS_RDONLY; } break; default: if (!silent) - printk("unknown ufstype\n"); + pr_err("unknown ufstype\n"); goto failed; } again: if (!sb_set_blocksize(sb, block_size)) { - printk(KERN_ERR "UFS: failed to set blocksize\n"); + pr_err("failed to set blocksize\n"); goto failed; } @@ -1034,7 +1034,7 @@ again: goto again; } if (!silent) - printk("ufs_read_super: bad magic number\n"); + pr_err("%s(): bad magic number\n", __func__); goto failed; magic_found: @@ -1048,33 +1048,33 @@ magic_found: uspi->s_fshift = fs32_to_cpu(sb, usb1->fs_fshift); if (!is_power_of_2(uspi->s_fsize)) { - printk(KERN_ERR "ufs_read_super: fragment size %u is not a power of 2\n", - uspi->s_fsize); - goto failed; + pr_err("%s(): fragment size %u is not a power of 2\n", + __func__, uspi->s_fsize); + goto failed; } if (uspi->s_fsize < 512) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too small\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too small\n", + __func__, uspi->s_fsize); goto failed; } if (uspi->s_fsize > 4096) { - printk(KERN_ERR "ufs_read_super: fragment size %u is too large\n", - uspi->s_fsize); + pr_err("%s(): fragment size %u is too large\n", + __func__, uspi->s_fsize); goto failed; } if (!is_power_of_2(uspi->s_bsize)) { - printk(KERN_ERR "ufs_read_super: block size %u is not a power of 2\n", - uspi->s_bsize); + pr_err("%s(): block size %u is not a power of 2\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize < 4096) { - printk(KERN_ERR "ufs_read_super: block size %u is too small\n", - uspi->s_bsize); + pr_err("%s(): block size %u is too small\n", + __func__, uspi->s_bsize); goto failed; } if (uspi->s_bsize / uspi->s_fsize > 8) { - printk(KERN_ERR "ufs_read_super: too many fragments per block (%u)\n", - uspi->s_bsize / uspi->s_fsize); + pr_err("%s(): too many fragments per block (%u)\n", + __func__, uspi->s_bsize / uspi->s_fsize); goto failed; } if (uspi->s_fsize != block_size || uspi->s_sbsize != super_block_size) { @@ -1113,20 +1113,21 @@ magic_found: UFSD("fs is DEC OSF/1\n"); break; case UFS_FSACTIVE: - printk("ufs_read_super: fs is active\n"); + pr_err("%s(): fs is active\n", __func__); sb->s_flags |= MS_RDONLY; break; case UFS_FSBAD: - printk("ufs_read_super: fs is bad\n"); + pr_err("%s(): fs is bad\n", __func__); sb->s_flags |= MS_RDONLY; break; default: - printk("ufs_read_super: can't grok fs_clean 0x%x\n", usb1->fs_clean); + pr_err("%s(): can't grok fs_clean 0x%x\n", + __func__, usb1->fs_clean); sb->s_flags |= MS_RDONLY; break; } } else { - printk("ufs_read_super: fs needs fsck\n"); + pr_err("%s(): fs needs fsck\n", __func__); sb->s_flags |= MS_RDONLY; } @@ -1299,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { - printk("ufstype can't be changed during remount\n"); + pr_err("ufstype can't be changed during remount\n"); unlock_ufs(sb); return -EINVAL; } @@ -1328,8 +1329,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) * fs was mounted as ro, remounting rw */ #ifndef CONFIG_UFS_FS_WRITE - printk("ufs was compiled with read-only support, " - "can't be mounted as read-write\n"); + pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); unlock_ufs(sb); return -EINVAL; #else @@ -1338,12 +1338,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_44BSD && ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { - printk("this ufstype is read-only supported\n"); + pr_err("this ufstype is read-only supported\n"); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { - printk("failed during remounting\n"); + pr_err("failed during remounting\n"); unlock_ufs(sb); return -EPERM; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 343e6fc571e5..2a07396d5f9e 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -1,6 +1,12 @@ #ifndef _UFS_UFS_H #define _UFS_UFS_H 1 +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #define UFS_MAX_GROUP_LOADED 8 #define UFS_CGNO_EMPTY ((unsigned)-1) @@ -71,9 +77,9 @@ struct ufs_inode_info { */ #ifdef CONFIG_UFS_DEBUG # define UFSD(f, a...) { \ - printk ("UFSD (%s, %d): %s:", \ + pr_debug("UFSD (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ - printk (f, ## a); \ + pr_debug(f, ## a); \ } #else # define UFSD(f, a...) /**/ diff --git a/fs/xattr.c b/fs/xattr.c index 3377dff18404..c69e6d43a0d2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -843,7 +843,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size) /* wrap around? */ len = sizeof(*new_xattr) + size; - if (len <= sizeof(*new_xattr)) + if (len < sizeof(*new_xattr)) return NULL; new_xattr = kmalloc(len, GFP_KERNEL); diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 7ad634501e48..e1c8d080c427 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -88,32 +88,32 @@ * lib/bitmap.c provides these functions: */ -extern int __bitmap_empty(const unsigned long *bitmap, int bits); -extern int __bitmap_full(const unsigned long *bitmap, int bits); +extern int __bitmap_empty(const unsigned long *bitmap, unsigned int nbits); +extern int __bitmap_full(const unsigned long *bitmap, unsigned int nbits); extern int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_complement(unsigned long *dst, const unsigned long *src, - int bits); + unsigned int nbits); extern void __bitmap_shift_right(unsigned long *dst, const unsigned long *src, int shift, int bits); extern void __bitmap_shift_left(unsigned long *dst, const unsigned long *src, int shift, int bits); extern int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); + const unsigned long *bitmap2, unsigned int nbits); extern int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits); -extern int __bitmap_weight(const unsigned long *bitmap, int bits); + const unsigned long *bitmap2, unsigned int nbits); +extern int __bitmap_weight(const unsigned long *bitmap, unsigned int nbits); -extern void bitmap_set(unsigned long *map, int i, int len); -extern void bitmap_clear(unsigned long *map, int start, int nr); +extern void bitmap_set(unsigned long *map, unsigned int start, int len); +extern void bitmap_clear(unsigned long *map, unsigned int start, int len); extern unsigned long bitmap_find_next_zero_area(unsigned long *map, unsigned long size, unsigned long start, @@ -140,9 +140,9 @@ extern void bitmap_onto(unsigned long *dst, const unsigned long *orig, const unsigned long *relmap, int bits); extern void bitmap_fold(unsigned long *dst, const unsigned long *orig, int sz, int bits); -extern int bitmap_find_free_region(unsigned long *bitmap, int bits, int order); -extern void bitmap_release_region(unsigned long *bitmap, int pos, int order); -extern int bitmap_allocate_region(unsigned long *bitmap, int pos, int order); +extern int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order); +extern void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order); +extern int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order); extern void bitmap_copy_le(void *dst, const unsigned long *src, int nbits); extern int bitmap_ord_to_pos(const unsigned long *bitmap, int n, int bits); @@ -188,15 +188,15 @@ static inline void bitmap_copy(unsigned long *dst, const unsigned long *src, } static inline int bitmap_and(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & *src2) != 0; + return (*dst = *src1 & *src2 & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_and(dst, src1, src2, nbits); } static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 | *src2; @@ -205,7 +205,7 @@ static inline void bitmap_or(unsigned long *dst, const unsigned long *src1, } static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) *dst = *src1 ^ *src2; @@ -214,24 +214,24 @@ static inline void bitmap_xor(unsigned long *dst, const unsigned long *src1, } static inline int bitmap_andnot(unsigned long *dst, const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) - return (*dst = *src1 & ~(*src2)) != 0; + return (*dst = *src1 & ~(*src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; return __bitmap_andnot(dst, src1, src2, nbits); } static inline void bitmap_complement(unsigned long *dst, const unsigned long *src, - int nbits) + unsigned int nbits) { if (small_const_nbits(nbits)) - *dst = ~(*src) & BITMAP_LAST_WORD_MASK(nbits); + *dst = ~(*src); else __bitmap_complement(dst, src, nbits); } static inline int bitmap_equal(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 ^ *src2) & BITMAP_LAST_WORD_MASK(nbits)); @@ -240,7 +240,7 @@ static inline int bitmap_equal(const unsigned long *src1, } static inline int bitmap_intersects(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ((*src1 & *src2) & BITMAP_LAST_WORD_MASK(nbits)) != 0; @@ -249,7 +249,7 @@ static inline int bitmap_intersects(const unsigned long *src1, } static inline int bitmap_subset(const unsigned long *src1, - const unsigned long *src2, int nbits) + const unsigned long *src2, unsigned int nbits) { if (small_const_nbits(nbits)) return ! ((*src1 & ~(*src2)) & BITMAP_LAST_WORD_MASK(nbits)); @@ -257,7 +257,7 @@ static inline int bitmap_subset(const unsigned long *src1, return __bitmap_subset(src1, src2, nbits); } -static inline int bitmap_empty(const unsigned long *src, int nbits) +static inline int bitmap_empty(const unsigned long *src, unsigned nbits) { if (small_const_nbits(nbits)) return ! (*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -265,7 +265,7 @@ static inline int bitmap_empty(const unsigned long *src, int nbits) return __bitmap_empty(src, nbits); } -static inline int bitmap_full(const unsigned long *src, int nbits) +static inline int bitmap_full(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return ! (~(*src) & BITMAP_LAST_WORD_MASK(nbits)); @@ -273,7 +273,7 @@ static inline int bitmap_full(const unsigned long *src, int nbits) return __bitmap_full(src, nbits); } -static inline int bitmap_weight(const unsigned long *src, int nbits) +static inline int bitmap_weight(const unsigned long *src, unsigned int nbits) { if (small_const_nbits(nbits)) return hweight_long(*src & BITMAP_LAST_WORD_MASK(nbits)); @@ -284,7 +284,7 @@ static inline void bitmap_shift_right(unsigned long *dst, const unsigned long *src, int n, int nbits) { if (small_const_nbits(nbits)) - *dst = *src >> n; + *dst = (*src & BITMAP_LAST_WORD_MASK(nbits)) >> n; else __bitmap_shift_right(dst, src, n, nbits); } diff --git a/include/linux/byteorder/generic.h b/include/linux/byteorder/generic.h index 0846e6b931ce..89f67c1c3160 100644 --- a/include/linux/byteorder/generic.h +++ b/include/linux/byteorder/generic.h @@ -2,7 +2,7 @@ #define _LINUX_BYTEORDER_GENERIC_H /* - * linux/byteorder_generic.h + * linux/byteorder/generic.h * Generic Byte-reordering support * * The "... p" macros, like le64_to_cpup, can be used with pointers diff --git a/include/linux/cma.h b/include/linux/cma.h new file mode 100644 index 000000000000..371b93042520 --- /dev/null +++ b/include/linux/cma.h @@ -0,0 +1,27 @@ +#ifndef __CMA_H__ +#define __CMA_H__ + +/* + * There is always at least global CMA area and a few optional + * areas configured in kernel .config. + */ +#ifdef CONFIG_CMA_AREAS +#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) + +#else +#define MAX_CMA_AREAS (0) + +#endif + +struct cma; + +extern phys_addr_t cma_get_base(struct cma *cma); +extern unsigned long cma_get_size(struct cma *cma); + +extern int __init cma_declare_contiguous(phys_addr_t size, + phys_addr_t base, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma); +extern struct page *cma_alloc(struct cma *cma, int count, unsigned int align); +extern bool cma_release(struct cma *cma, struct page *pages, int count); +#endif diff --git a/include/linux/crc64_ecma.h b/include/linux/crc64_ecma.h new file mode 100644 index 000000000000..bba7a4d692b3 --- /dev/null +++ b/include/linux/crc64_ecma.h @@ -0,0 +1,56 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __CRC64_ECMA_H_ +#define __CRC64_ECMA_H_ + +#include <linux/types.h> + + +#define CRC64_DEFAULT_INITVAL 0xFFFFFFFFFFFFFFFFULL + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * @pdata: pointer to the data to compute checksum for. + * @nbytes: number of bytes in data buffer. + * @seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed); + +#endif /* __CRC64_ECMA_H_ */ diff --git a/include/linux/decompress/bunzip2.h b/include/linux/decompress/bunzip2.h index 115272137a9c..4d683df898e6 100644 --- a/include/linux/decompress/bunzip2.h +++ b/include/linux/decompress/bunzip2.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_BUNZIP2_H #define DECOMPRESS_BUNZIP2_H -int bunzip2(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int bunzip2(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h index 0c7111a55a1a..1fcfd64b5076 100644 --- a/include/linux/decompress/generic.h +++ b/include/linux/decompress/generic.h @@ -1,11 +1,11 @@ #ifndef DECOMPRESS_GENERIC_H #define DECOMPRESS_GENERIC_H -typedef int (*decompress_fn) (unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +typedef int (*decompress_fn) (unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *posp, + long *posp, void(*error)(char *x)); /* inbuf - input buffer @@ -33,7 +33,7 @@ typedef int (*decompress_fn) (unsigned char *inbuf, int len, /* Utility routine to detect the decompression method */ -decompress_fn decompress_method(const unsigned char *inbuf, int len, +decompress_fn decompress_method(const unsigned char *inbuf, long len, const char **name); #endif diff --git a/include/linux/decompress/inflate.h b/include/linux/decompress/inflate.h index 1d0aedef9822..e4f411fdbd24 100644 --- a/include/linux/decompress/inflate.h +++ b/include/linux/decompress/inflate.h @@ -1,10 +1,10 @@ #ifndef LINUX_DECOMPRESS_INFLATE_H #define LINUX_DECOMPRESS_INFLATE_H -int gunzip(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int gunzip(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error_fn)(char *x)); #endif diff --git a/include/linux/decompress/unlz4.h b/include/linux/decompress/unlz4.h index d5b68bf3ec92..3273c2f36496 100644 --- a/include/linux/decompress/unlz4.h +++ b/include/linux/decompress/unlz4.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_UNLZ4_H #define DECOMPRESS_UNLZ4_H -int unlz4(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlz4(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/unlzma.h b/include/linux/decompress/unlzma.h index 7796538f1bf4..8a891a193840 100644 --- a/include/linux/decompress/unlzma.h +++ b/include/linux/decompress/unlzma.h @@ -1,11 +1,11 @@ #ifndef DECOMPRESS_UNLZMA_H #define DECOMPRESS_UNLZMA_H -int unlzma(unsigned char *, int, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlzma(unsigned char *, long, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ); diff --git a/include/linux/decompress/unlzo.h b/include/linux/decompress/unlzo.h index 987229752519..af18f95d6570 100644 --- a/include/linux/decompress/unlzo.h +++ b/include/linux/decompress/unlzo.h @@ -1,10 +1,10 @@ #ifndef DECOMPRESS_UNLZO_H #define DECOMPRESS_UNLZO_H -int unlzo(unsigned char *inbuf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +int unlzo(unsigned char *inbuf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *pos, + long *pos, void(*error)(char *x)); #endif diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h index 41728fc6c8a1..f764e2a7201e 100644 --- a/include/linux/decompress/unxz.h +++ b/include/linux/decompress/unxz.h @@ -10,10 +10,10 @@ #ifndef DECOMPRESS_UNXZ_H #define DECOMPRESS_UNXZ_H -int unxz(unsigned char *in, int in_size, - int (*fill)(void *dest, unsigned int size), - int (*flush)(void *src, unsigned int size), - unsigned char *out, int *in_used, +int unxz(unsigned char *in, long in_size, + long (*fill)(void *dest, unsigned long size), + long (*flush)(void *src, unsigned long size), + unsigned char *out, long *in_used, void (*error)(char *x)); #endif diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 772eab5d524a..569bbd039896 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -53,18 +53,13 @@ #ifdef __KERNEL__ +#include <linux/device.h> + struct cma; struct page; -struct device; #ifdef CONFIG_DMA_CMA -/* - * There is always at least global CMA area and a few optional device - * private areas configured in kernel .config. - */ -#define MAX_CMA_AREAS (1 + CONFIG_CMA_AREAS) - extern struct cma *dma_contiguous_default_area; static inline struct cma *dev_get_cma_area(struct device *dev) @@ -123,8 +118,6 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #else -#define MAX_CMA_AREAS (0) - static inline struct cma *dev_get_cma_area(struct device *dev) { return NULL; diff --git a/include/linux/efi.h b/include/linux/efi.h index 41bbf8ba4ba8..1e191f8e5244 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1151,6 +1151,9 @@ int efivars_sysfs_init(void); #ifdef CONFIG_EFI_RUNTIME_MAP int efi_runtime_map_init(struct kobject *); void efi_runtime_map_setup(void *, int, u32); +int efi_get_runtime_map_size(void); +int efi_get_runtime_map_desc_size(void); +int efi_runtime_map_copy(void *buf, size_t bufsz); #else static inline int efi_runtime_map_init(struct kobject *kobj) { @@ -1159,6 +1162,22 @@ static inline int efi_runtime_map_init(struct kobject *kobj) static inline void efi_runtime_map_setup(void *map, int nr_entries, u32 desc_size) {} + +static inline int efi_get_runtime_map_size(void) +{ + return 0; +} + +static inline int efi_get_runtime_map_desc_size(void) +{ + return 0; +} + +static inline int efi_runtime_map_copy(void *buf, size_t bufsz) +{ + return 0; +} + #endif #endif /* _LINUX_EFI_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 2daccaf4b547..1ab6c6913040 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2688,7 +2688,7 @@ static const struct file_operations __fops = { \ .read = simple_attr_read, \ .write = simple_attr_write, \ .llseek = generic_file_llseek, \ -}; +} static inline __printf(1, 2) void __simple_attr_check_format(const char *fmt, ...) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 6eb1fb37de9a..5e7219dc0fae 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -360,7 +360,7 @@ extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); /* This is different from alloc_pages_exact_node !!! */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) diff --git a/include/linux/glob.h b/include/linux/glob.h new file mode 100644 index 000000000000..c990b7fbf0a8 --- /dev/null +++ b/include/linux/glob.h @@ -0,0 +1,10 @@ +#ifndef _LINUX_GLOB_H +#define _LINUX_GLOB_H + +#include <linux/types.h> /* For bool */ +#include <linux/compiler.h> /* For __pure */ + +bool __pure glob_match(char const *pat, char const *str); + +#endif /* _LINUX_GLOB_H */ + diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index b826239bdce0..63579cb8d3dc 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -93,10 +93,6 @@ extern bool is_vma_temporary_stack(struct vm_area_struct *vma); #endif /* CONFIG_DEBUG_VM */ extern unsigned long transparent_hugepage_flags; -extern int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, - struct vm_area_struct *vma, - unsigned long addr, unsigned long end); extern int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 255cd5cc0754..41272bcf73f8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -86,7 +86,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); #endif extern unsigned long hugepages_treat_as_movable; -extern const unsigned long hugetlb_zero, hugetlb_infinity; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/include/linux/input.h b/include/linux/input.h index 82ce323b9986..6453b22372ac 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -79,6 +79,7 @@ struct input_value { * @led: reflects current state of device's LEDs * @snd: reflects current state of sound effects * @sw: reflects current state of device's switches + * @leds: leds objects for the device's LEDs * @open: this method is called when the very first user calls * input_open_device(). The driver must prepare the device * to start generating events (start polling thread, @@ -164,6 +165,8 @@ struct input_dev { unsigned long snd[BITS_TO_LONGS(SND_CNT)]; unsigned long sw[BITS_TO_LONGS(SW_CNT)]; + struct led_classdev *leds; + int (*open)(struct input_dev *dev); void (*close)(struct input_dev *dev); int (*flush)(struct input_dev *dev, struct file *file); @@ -531,4 +534,22 @@ int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); +#ifdef CONFIG_INPUT_LEDS + +int input_led_connect(struct input_dev *dev); +void input_led_disconnect(struct input_dev *dev); + +#else + +static inline int input_led_connect(struct input_dev *dev) +{ + return 0; +} + +static inline void input_led_disconnect(struct input_dev *dev) +{ +} + +#endif + #endif diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 5e3a906cc089..142ec544167c 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -237,6 +237,12 @@ extern int iomem_is_exclusive(u64 addr); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); +extern int +walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); +extern int +walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a9e2268ecccb..e9892041cb41 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -470,6 +470,7 @@ extern enum system_states { #define TAINT_FIRMWARE_WORKAROUND 11 #define TAINT_OOT_MODULE 12 #define TAINT_UNSIGNED_MODULE 13 +#define TAINT_SOFTLOCKUP 14 extern const char hex_asc[]; #define hex_asc_lo(x) hex_asc[((x) & 0x0f)] @@ -493,11 +494,6 @@ static inline char *hex_byte_pack_upper(char *buf, u8 byte) return buf; } -static inline char * __deprecated pack_hex_byte(char *buf, u8 byte) -{ - return hex_byte_pack(buf, byte); -} - extern int hex_to_bin(char ch); extern int __must_check hex2bin(u8 *dst, const char *src, size_t count); @@ -719,23 +715,8 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } (void) (&_max1 == &_max2); \ _max1 > _max2 ? _max1 : _max2; }) -#define min3(x, y, z) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - typeof(z) _min3 = (z); \ - (void) (&_min1 == &_min2); \ - (void) (&_min1 == &_min3); \ - _min1 < _min2 ? (_min1 < _min3 ? _min1 : _min3) : \ - (_min2 < _min3 ? _min2 : _min3); }) - -#define max3(x, y, z) ({ \ - typeof(x) _max1 = (x); \ - typeof(y) _max2 = (y); \ - typeof(z) _max3 = (z); \ - (void) (&_max1 == &_max2); \ - (void) (&_max1 == &_max3); \ - _max1 > _max2 ? (_max1 > _max3 ? _max1 : _max3) : \ - (_max2 > _max3 ? _max2 : _max3); }) +#define min3(x, y, z) min((typeof(x))min(x, y), z) +#define max3(x, y, z) max((typeof(x))max(x, y), z) /** * min_not_zero - return the minimum that is _not_ zero, unless both are zero @@ -750,20 +731,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /** * clamp - return a value clamped to a given range with strict typechecking * @val: current value - * @min: minimum allowable value - * @max: maximum allowable value + * @lo: lowest allowable value + * @hi: highest allowable value * * This macro does strict typechecking of min/max to make sure they are of the * same type as val. See the unnecessary pointer comparisons. */ -#define clamp(val, min, max) ({ \ - typeof(val) __val = (val); \ - typeof(min) __min = (min); \ - typeof(max) __max = (max); \ - (void) (&__val == &__min); \ - (void) (&__val == &__max); \ - __val = __val < __min ? __min: __val; \ - __val > __max ? __max: __val; }) +#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi) /* * ..and if you can't take the strict diff --git a/include/linux/kexec.h b/include/linux/kexec.h index a75641930049..9481703b0e7a 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -10,6 +10,7 @@ #include <linux/ioport.h> #include <linux/elfcore.h> #include <linux/elf.h> +#include <linux/module.h> #include <asm/kexec.h> /* Verify architecture specific macros are defined */ @@ -69,7 +70,18 @@ typedef unsigned long kimage_entry_t; #define IND_SOURCE 0x8 struct kexec_segment { - void __user *buf; + /* + * This pointer can point to user memory if kexec_load() system + * call is used or will point to kernel memory if + * kexec_file_load() system call is used. + * + * Use ->buf when expecting to deal with user memory and use ->kbuf + * when expecting to deal with kernel memory. + */ + union { + void __user *buf; + void *kbuf; + }; size_t bufsz; unsigned long mem; size_t memsz; @@ -84,6 +96,27 @@ struct compat_kexec_segment { }; #endif +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +struct purgatory_info { + /* Pointer to elf header of read only purgatory */ + Elf_Ehdr *ehdr; + + /* Pointer to purgatory sechdrs which are modifiable */ + Elf_Shdr *sechdrs; + /* + * Temporary buffer location where purgatory is loaded and relocated + * This memory can be freed post image load + */ + void *purgatory_buf; + + /* Address where purgatory is finally loaded and is executed from */ + unsigned long purgatory_load_addr; +}; + struct kimage { kimage_entry_t head; kimage_entry_t *entry; @@ -100,7 +133,7 @@ struct kimage { struct list_head control_pages; struct list_head dest_pages; - struct list_head unuseable_pages; + struct list_head unusable_pages; /* Address of next control page to allocate for crash kernels. */ unsigned long control_page; @@ -110,13 +143,60 @@ struct kimage { #define KEXEC_TYPE_DEFAULT 0 #define KEXEC_TYPE_CRASH 1 unsigned int preserve_context : 1; + /* If set, we are using file mode kexec syscall */ + unsigned int file_mode:1; #ifdef ARCH_HAS_KIMAGE_ARCH struct kimage_arch arch; #endif + + /* Additional fields for file based kexec syscall */ + void *kernel_buf; + unsigned long kernel_buf_len; + + void *initrd_buf; + unsigned long initrd_buf_len; + + char *cmdline_buf; + unsigned long cmdline_buf_len; + + /* File operations provided by image loader */ + struct kexec_file_ops *fops; + + /* Image loader handling the kernel can store a pointer here */ + void *image_loader_data; + + /* Information for loading purgatory */ + struct purgatory_info purgatory_info; }; +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; +typedef int (kexec_probe_t)(const char *kernel_buf, unsigned long kernel_size); +typedef void *(kexec_load_t)(struct kimage *image, char *kernel_buf, + unsigned long kernel_len, char *initrd, + unsigned long initrd_len, char *cmdline, + unsigned long cmdline_len); +typedef int (kexec_cleanup_t)(void *loader_data); + +struct kexec_file_ops { + kexec_probe_t *probe; + kexec_load_t *load; + kexec_cleanup_t *cleanup; +}; /* kexec interface functions */ extern void machine_kexec(struct kimage *image); @@ -127,8 +207,21 @@ extern asmlinkage long sys_kexec_load(unsigned long entry, struct kexec_segment __user *segments, unsigned long flags); extern int kernel_kexec(void); +extern int kexec_add_buffer(struct kimage *image, char *buffer, + unsigned long bufsz, unsigned long memsz, + unsigned long buf_align, unsigned long buf_min, + unsigned long buf_max, bool top_down, + unsigned long *load_addr); extern struct page *kimage_alloc_control_pages(struct kimage *image, unsigned int order); +extern int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr); +extern int kexec_purgatory_get_set_symbol(struct kimage *image, + const char *name, void *buf, + unsigned int size, bool get_value); +extern void *kexec_purgatory_get_symbol_addr(struct kimage *image, + const char *name); extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); void crash_save_cpu(struct pt_regs *regs, int cpu); @@ -177,6 +270,10 @@ extern int kexec_load_disabled; #define KEXEC_FLAGS (KEXEC_ON_CRASH | KEXEC_PRESERVE_CONTEXT) #endif +/* List of defined/legal kexec file flags */ +#define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ + KEXEC_FILE_NO_INITRAMFS) + #define VMCOREINFO_BYTES (4096) #define VMCOREINFO_NOTE_NAME "VMCOREINFO" #define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) diff --git a/include/linux/klist.h b/include/linux/klist.h index a370ce57cf1d..61e5b723ae73 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -44,7 +44,7 @@ struct klist_node { extern void klist_add_tail(struct klist_node *n, struct klist *k); extern void klist_add_head(struct klist_node *n, struct klist *k); -extern void klist_add_after(struct klist_node *n, struct klist_node *pos); +extern void klist_add_behind(struct klist_node *n, struct klist_node *pos); extern void klist_add_before(struct klist_node *n, struct klist_node *pos); extern void klist_del(struct klist_node *n); diff --git a/include/linux/list.h b/include/linux/list.h index ef9594171062..cbbb96fcead9 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -654,15 +654,15 @@ static inline void hlist_add_before(struct hlist_node *n, *(n->pprev) = n; } -static inline void hlist_add_after(struct hlist_node *n, - struct hlist_node *next) +static inline void hlist_add_behind(struct hlist_node *n, + struct hlist_node *prev) { - next->next = n->next; - n->next = next; - next->pprev = &n->next; + n->next = prev->next; + prev->next = n; + n->pprev = &prev->next; - if(next->next) - next->next->pprev = &next->next; + if (n->next) + n->next->pprev = &n->next; } /* after that we'll appear to be on some hlist and hlist_del will work */ diff --git a/include/linux/memblock.h b/include/linux/memblock.h index b660e05b63d4..e8cc45307f8f 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -249,7 +249,7 @@ phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); /* * Set the allocation direction to bottom-up or top-down. */ -static inline void memblock_set_bottom_up(bool enable) +static inline void __init memblock_set_bottom_up(bool enable) { memblock.bottom_up = enable; } @@ -264,7 +264,7 @@ static inline bool memblock_bottom_up(void) return memblock.bottom_up; } #else -static inline void memblock_set_bottom_up(bool enable) {} +static inline void __init memblock_set_bottom_up(bool enable) {} static inline bool memblock_bottom_up(void) { return false; } #endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index eb65d29516ca..e0752d204d9e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -54,39 +54,20 @@ struct mem_cgroup_reclaim_cookie { }; #ifdef CONFIG_MEMCG -/* - * All "charge" functions with gfp_mask should use GFP_KERNEL or - * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't - * alloc memory but reclaims memory from all available zones. So, "where I want - * memory from" bits of gfp_mask has no meaning. So any bits of that field is - * available but adding a rule is better. charge functions' gfp_mask should - * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous - * codes. - * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.) - */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp); +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare); +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); +void mem_cgroup_uncharge(struct page *page); +void mem_cgroup_uncharge_list(struct list_head *page_list); -extern int mem_cgroup_charge_anon(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); -/* for swap handling */ -extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t mask, struct mem_cgroup **memcgp); -extern void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg); -extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg); - -extern int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask); +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare); struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *); struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *); -/* For coalescing uncharge for reducing memcg' overhead*/ -extern void mem_cgroup_uncharge_start(void); -extern void mem_cgroup_uncharge_end(void); - -extern void mem_cgroup_uncharge_page(struct page *page); -extern void mem_cgroup_uncharge_cache_page(struct page *page); - bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); bool task_in_mem_cgroup(struct task_struct *task, @@ -113,12 +94,6 @@ bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); -extern void -mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp); -extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok); - struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); @@ -133,8 +108,6 @@ unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list); void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int); extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); -extern void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage); static inline void mem_cgroup_oom_enable(void) { @@ -233,46 +206,36 @@ void mem_cgroup_print_bad_page(struct page *page); #else /* CONFIG_MEMCG */ struct mem_cgroup; -static inline int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - -static inline int mem_cgroup_charge_file(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - return 0; -} - -static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp) +static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, + struct mem_cgroup **memcgp) { + *memcgp = NULL; return 0; } -static inline void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) -{ -} - -static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) +static inline void mem_cgroup_commit_charge(struct page *page, + struct mem_cgroup *memcg, + bool lrucare) { } -static inline void mem_cgroup_uncharge_start(void) +static inline void mem_cgroup_cancel_charge(struct page *page, + struct mem_cgroup *memcg) { } -static inline void mem_cgroup_uncharge_end(void) +static inline void mem_cgroup_uncharge(struct page *page) { } -static inline void mem_cgroup_uncharge_page(struct page *page) +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } -static inline void mem_cgroup_uncharge_cache_page(struct page *page) +static inline void mem_cgroup_migrate(struct page *oldpage, + struct page *newpage, + bool lrucare) { } @@ -311,17 +274,6 @@ static inline struct cgroup_subsys_state return NULL; } -static inline void -mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) -{ -} - -static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) -{ -} - static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, @@ -417,10 +369,6 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { } -static inline void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) -{ -} #endif /* CONFIG_MEMCG */ #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 010d125bffbf..79dd9eca054f 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -26,11 +26,12 @@ enum { MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, }; -/* Types for control the zone type of onlined memory */ +/* Types for control the zone type of onlined and offlined memory */ enum { - ONLINE_KEEP, - ONLINE_KERNEL, - ONLINE_MOVABLE, + MMOP_OFFLINE = -1, + MMOP_ONLINE_KEEP, + MMOP_ONLINE_KERNEL, + MMOP_ONLINE_MOVABLE, }; /* diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 96c5750e3110..21bff4be4379 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -461,6 +461,7 @@ static inline void mm_init_cpumask(struct mm_struct *mm) #ifdef CONFIG_CPUMASK_OFFSTACK mm->cpu_vm_mask_var = &mm->cpumask_allocation; #endif + cpumask_clear(mm->cpu_vm_mask_var); } /* Future-safe accessor for struct mm_struct's cpu_vm_mask. */ diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index edd82a105220..2f348d02f640 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -20,11 +20,13 @@ extern void dump_page_badflags(struct page *page, const char *reason, } while (0) #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) +#define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 6cbd1b6c3d20..318df7051850 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -143,6 +143,7 @@ enum zone_stat_item { NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ + NR_PAGES_SCANNED, /* pages scanned since last reclaim */ #ifdef CONFIG_NUMA NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ @@ -324,19 +325,12 @@ enum zone_type { #ifndef __GENERATING_BOUNDS_H struct zone { - /* Fields commonly accessed by the page allocator */ + /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long watermark[NR_WMARK]; /* - * When free pages are below this point, additional steps are taken - * when reading the number of free pages to avoid per-cpu counter - * drift allowing watermarks to be breached - */ - unsigned long percpu_drift_mark; - - /* * We don't know if the memory that we're going to allocate will be freeable * or/and it will be released eventually, so to avoid totally wasting several * GB of ram we must reserve some of the lower zone memory (otherwise we risk @@ -344,41 +338,26 @@ struct zone { * on the higher zones). This array is recalculated at runtime if the * sysctl_lowmem_reserve_ratio sysctl changes. */ - unsigned long lowmem_reserve[MAX_NR_ZONES]; - - /* - * This is a per-zone reserve of pages that should not be - * considered dirtyable memory. - */ - unsigned long dirty_balance_reserve; + long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NUMA int node; +#endif + /* - * zone reclaim becomes active if more unmapped pages exist. + * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on + * this zone's LRU. Maintained by the pageout code. */ - unsigned long min_unmapped_pages; - unsigned long min_slab_pages; -#endif + unsigned int inactive_ratio; + + struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; + /* - * free areas of different sizes + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. */ - spinlock_t lock; -#if defined CONFIG_COMPACTION || defined CONFIG_CMA - /* Set to true when the PG_migrate_skip bits should be cleared */ - bool compact_blockskip_flush; - - /* pfn where compaction free scanner should start */ - unsigned long compact_cached_free_pfn; - /* pfn where async and sync compaction migration scanner should start */ - unsigned long compact_cached_migrate_pfn[2]; -#endif -#ifdef CONFIG_MEMORY_HOTPLUG - /* see spanned/present_pages for more description */ - seqlock_t span_seqlock; -#endif - struct free_area free_area[MAX_ORDER]; + unsigned long dirty_balance_reserve; #ifndef CONFIG_SPARSEMEM /* @@ -388,74 +367,14 @@ struct zone { unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ -#ifdef CONFIG_COMPACTION - /* - * On compaction failure, 1<<compact_defer_shift compactions - * are skipped before trying again. The number attempted since - * last failure is tracked with compact_considered. - */ - unsigned int compact_considered; - unsigned int compact_defer_shift; - int compact_order_failed; -#endif - - ZONE_PADDING(_pad1_) - - /* Fields commonly accessed by the page reclaim scanner */ - spinlock_t lru_lock; - struct lruvec lruvec; - - /* Evictions & activations on the inactive file list */ - atomic_long_t inactive_age; - - unsigned long pages_scanned; /* since last reclaim */ - unsigned long flags; /* zone flags, see below */ - - /* Zone statistics */ - atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; - - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this zone's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - - - ZONE_PADDING(_pad2_) - /* Rarely used or read-mostly fields */ - +#ifdef CONFIG_NUMA /* - * wait_table -- the array holding the hash table - * wait_table_hash_nr_entries -- the size of the hash table array - * wait_table_bits -- wait_table_size == (1 << wait_table_bits) - * - * The purpose of all these is to keep track of the people - * waiting for a page to become available and make them - * runnable again when possible. The trouble is that this - * consumes a lot of space, especially when so few things - * wait on pages at a given time. So instead of using - * per-page waitqueues, we use a waitqueue hash table. - * - * The bucket discipline is to sleep on the same queue when - * colliding and wake all in that wait queue when removing. - * When something wakes, it must check to be sure its page is - * truly available, a la thundering herd. The cost of a - * collision is great, but given the expected load of the - * table, they should be so rare as to be outweighed by the - * benefits from the saved space. - * - * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the - * primary users of these fields, and in mm/page_alloc.c - * free_area_init_core() performs the initialization of them. + * zone reclaim becomes active if more unmapped pages exist. */ - wait_queue_head_t * wait_table; - unsigned long wait_table_hash_nr_entries; - unsigned long wait_table_bits; + unsigned long min_unmapped_pages; + unsigned long min_slab_pages; +#endif /* CONFIG_NUMA */ - /* - * Discontig memory support fields. - */ - struct pglist_data *zone_pgdat; /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; @@ -500,9 +419,11 @@ struct zone { * adjust_managed_page_count() should be used instead of directly * touching zone->managed_pages and totalram_pages. */ + unsigned long managed_pages; unsigned long spanned_pages; unsigned long present_pages; - unsigned long managed_pages; + + const char *name; /* * Number of MIGRATE_RESEVE page block. To maintain for just @@ -510,10 +431,94 @@ struct zone { */ int nr_migrate_reserve_block; +#ifdef CONFIG_MEMORY_HOTPLUG + /* see spanned/present_pages for more description */ + seqlock_t span_seqlock; +#endif + /* - * rarely used fields: + * wait_table -- the array holding the hash table + * wait_table_hash_nr_entries -- the size of the hash table array + * wait_table_bits -- wait_table_size == (1 << wait_table_bits) + * + * The purpose of all these is to keep track of the people + * waiting for a page to become available and make them + * runnable again when possible. The trouble is that this + * consumes a lot of space, especially when so few things + * wait on pages at a given time. So instead of using + * per-page waitqueues, we use a waitqueue hash table. + * + * The bucket discipline is to sleep on the same queue when + * colliding and wake all in that wait queue when removing. + * When something wakes, it must check to be sure its page is + * truly available, a la thundering herd. The cost of a + * collision is great, but given the expected load of the + * table, they should be so rare as to be outweighed by the + * benefits from the saved space. + * + * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the + * primary users of these fields, and in mm/page_alloc.c + * free_area_init_core() performs the initialization of them. */ - const char *name; + wait_queue_head_t *wait_table; + unsigned long wait_table_hash_nr_entries; + unsigned long wait_table_bits; + + ZONE_PADDING(_pad1_) + + /* Write-intensive fields used from the page allocator */ + spinlock_t lock; + + /* free areas of different sizes */ + struct free_area free_area[MAX_ORDER]; + + /* zone flags, see below */ + unsigned long flags; + + ZONE_PADDING(_pad2_) + + /* Write-intensive fields used by page reclaim */ + + /* Fields commonly accessed by the page reclaim scanner */ + spinlock_t lru_lock; + struct lruvec lruvec; + + /* Evictions & activations on the inactive file list */ + atomic_long_t inactive_age; + + /* + * When free pages are below this point, additional steps are taken + * when reading the number of free pages to avoid per-cpu counter + * drift allowing watermarks to be breached + */ + unsigned long percpu_drift_mark; + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where compaction free scanner should start */ + unsigned long compact_cached_free_pfn; + /* pfn where async and sync compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[2]; +#endif + +#ifdef CONFIG_COMPACTION + /* + * On compaction failure, 1<<compact_defer_shift compactions + * are skipped before trying again. The number attempted since + * last failure is tracked with compact_considered. + */ + unsigned int compact_considered; + unsigned int compact_defer_shift; + int compact_order_failed; +#endif + +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* Set to true when the PG_migrate_skip bits should be cleared */ + bool compact_blockskip_flush; +#endif + + ZONE_PADDING(_pad3_) + /* Zone statistics */ + atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; typedef enum { @@ -529,6 +534,7 @@ typedef enum { ZONE_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ + ZONE_FAIR_DEPLETED, /* fair zone policy batch depleted */ } zone_flags_t; static inline void zone_set_flag(struct zone *zone, zone_flags_t flag) @@ -566,6 +572,11 @@ static inline int zone_is_reclaim_locked(const struct zone *zone) return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags); } +static inline int zone_is_fair_depleted(const struct zone *zone) +{ + return test_bit(ZONE_FAIR_DEPLETED, &zone->flags); +} + static inline int zone_is_oom_locked(const struct zone *zone) { return test_bit(ZONE_OOM_LOCKED, &zone->flags); @@ -872,6 +883,8 @@ static inline int zone_movable_is_highmem(void) { #if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) return movable_zone == ZONE_HIGHMEM; +#elif defined(CONFIG_HIGHMEM) + return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #else return 0; #endif diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8304959ad336..e1f5fcd79792 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -171,13 +171,12 @@ static inline int __TestClearPage##uname(struct page *page) \ #define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \ __SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname) -#define PAGEFLAG_FALSE(uname) \ -static inline int Page##uname(const struct page *page) \ - { return 0; } - #define TESTSCFLAG(uname, lname) \ TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname) +#define TESTPAGEFLAG_FALSE(uname) \ +static inline int Page##uname(const struct page *page) { return 0; } + #define SETPAGEFLAG_NOOP(uname) \ static inline void SetPage##uname(struct page *page) { } @@ -187,12 +186,21 @@ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname) \ static inline void __ClearPage##uname(struct page *page) { } +#define TESTSETFLAG_FALSE(uname) \ +static inline int TestSetPage##uname(struct page *page) { return 0; } + #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define __TESTCLEARFLAG_FALSE(uname) \ static inline int __TestClearPage##uname(struct page *page) { return 0; } +#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ + SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) + +#define TESTSCFLAG_FALSE(uname) \ + TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) + struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) @@ -248,7 +256,6 @@ PAGEFLAG_FALSE(HighMem) PAGEFLAG(SwapCache, swapcache) #else PAGEFLAG_FALSE(SwapCache) - SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) @@ -258,8 +265,8 @@ PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) PAGEFLAG(Mlocked, mlocked) __CLEARPAGEFLAG(Mlocked, mlocked) TESTSCFLAG(Mlocked, mlocked) __TESTCLEARFLAG(Mlocked, mlocked) #else -PAGEFLAG_FALSE(Mlocked) SETPAGEFLAG_NOOP(Mlocked) - TESTCLEARFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) +PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) + TESTSCFLAG_FALSE(Mlocked) __TESTCLEARFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index 777a524716db..5c831f1eca79 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -3,17 +3,15 @@ enum { /* flags for mem_cgroup */ - PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ - PCG_USED, /* this object is in use. */ - PCG_MIGRATION, /* under page migration */ - __NR_PCG_FLAGS, + PCG_USED = 0x01, /* This page is charged to a memcg */ + PCG_MEM = 0x02, /* This page holds a memory charge */ + PCG_MEMSW = 0x04, /* This page holds a memory+swap charge */ }; -#ifndef __GENERATING_BOUNDS_H -#include <generated/bounds.h> +struct pglist_data; #ifdef CONFIG_MEMCG -#include <linux/bit_spinlock.h> +struct mem_cgroup; /* * Page Cgroup can be considered as an extended mem_map. @@ -27,65 +25,30 @@ struct page_cgroup { struct mem_cgroup *mem_cgroup; }; -void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat); +extern void pgdat_page_cgroup_init(struct pglist_data *pgdat); #ifdef CONFIG_SPARSEMEM -static inline void __init page_cgroup_init_flatmem(void) +static inline void page_cgroup_init_flatmem(void) { } -extern void __init page_cgroup_init(void); +extern void page_cgroup_init(void); #else -void __init page_cgroup_init_flatmem(void); -static inline void __init page_cgroup_init(void) +extern void page_cgroup_init_flatmem(void); +static inline void page_cgroup_init(void) { } #endif struct page_cgroup *lookup_page_cgroup(struct page *page); -struct page *lookup_cgroup_page(struct page_cgroup *pc); - -#define TESTPCGFLAG(uname, lname) \ -static inline int PageCgroup##uname(struct page_cgroup *pc) \ - { return test_bit(PCG_##lname, &pc->flags); } - -#define SETPCGFLAG(uname, lname) \ -static inline void SetPageCgroup##uname(struct page_cgroup *pc)\ - { set_bit(PCG_##lname, &pc->flags); } - -#define CLEARPCGFLAG(uname, lname) \ -static inline void ClearPageCgroup##uname(struct page_cgroup *pc) \ - { clear_bit(PCG_##lname, &pc->flags); } - -#define TESTCLEARPCGFLAG(uname, lname) \ -static inline int TestClearPageCgroup##uname(struct page_cgroup *pc) \ - { return test_and_clear_bit(PCG_##lname, &pc->flags); } - -TESTPCGFLAG(Used, USED) -CLEARPCGFLAG(Used, USED) -SETPCGFLAG(Used, USED) - -SETPCGFLAG(Migration, MIGRATION) -CLEARPCGFLAG(Migration, MIGRATION) -TESTPCGFLAG(Migration, MIGRATION) -static inline void lock_page_cgroup(struct page_cgroup *pc) +static inline int PageCgroupUsed(struct page_cgroup *pc) { - /* - * Don't take this lock in IRQ context. - * This lock is for pc->mem_cgroup, USED, MIGRATION - */ - bit_spin_lock(PCG_LOCK, &pc->flags); + return !!(pc->flags & PCG_USED); } - -static inline void unlock_page_cgroup(struct page_cgroup *pc) -{ - bit_spin_unlock(PCG_LOCK, &pc->flags); -} - -#else /* CONFIG_MEMCG */ +#else /* !CONFIG_MEMCG */ struct page_cgroup; -static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) +static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat) { } @@ -98,10 +61,9 @@ static inline void page_cgroup_init(void) { } -static inline void __init page_cgroup_init_flatmem(void) +static inline void page_cgroup_init_flatmem(void) { } - #endif /* CONFIG_MEMCG */ #include <linux/swap.h> @@ -140,6 +102,4 @@ static inline void swap_cgroup_swapoff(int type) #endif /* CONFIG_MEMCG_SWAP */ -#endif /* !__GENERATING_BOUNDS_H */ - #endif /* __LINUX_PAGE_CGROUP_H */ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 0a97b583ee8d..3df8c7db7a4e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -399,6 +399,18 @@ static inline struct page *read_mapping_page(struct address_space *mapping, } /* + * Get the offset in PAGE_SIZE. + * (TODO: hugepage should have ->index in PAGE_SIZE) + */ +static inline pgoff_t page_to_pgoff(struct page *page) +{ + if (unlikely(PageHeadHuge(page))) + return page->index << compound_order(page); + else + return page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); +} + +/* * Return byte-offset into filesystem object for page. */ static inline loff_t page_offset(struct page *page) @@ -472,6 +484,9 @@ static inline int lock_page_killable(struct page *page) /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. + * + * Return value and mmap_sem implications depend on flags; see + * __lock_page_or_retry(). */ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) diff --git a/include/linux/printk.h b/include/linux/printk.h index 319ff7e53efb..0990997a5304 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -31,7 +31,7 @@ static inline const char *printk_skip_level(const char *buffer) } /* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL +#define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT /* We show everything that is MORE important than this.. */ #define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 8183b46fbaa2..372ad5e0dcb8 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -432,9 +432,9 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, } /** - * hlist_add_after_rcu - * @prev: the existing element to add the new element after. + * hlist_add_behind_rcu * @n: the new element to add to the hash list. + * @prev: the existing element to add the new element after. * * Description: * Adds the specified element to the specified hlist @@ -449,8 +449,8 @@ static inline void hlist_add_before_rcu(struct hlist_node *n, * hlist_for_each_entry_rcu(), used to prevent memory-consistency * problems on Alpha CPUs. */ -static inline void hlist_add_after_rcu(struct hlist_node *prev, - struct hlist_node *n) +static inline void hlist_add_behind_rcu(struct hlist_node *n, + struct hlist_node *prev) { n->next = prev->next; n->pprev = &prev->next; diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index a964f7285600..4b152c81c5fa 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -136,7 +136,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, static inline void sg_chain(struct scatterlist *prv, unsigned int prv_nents, struct scatterlist *sgl) { -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN BUG(); #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 42cac4dc2157..72d06cc99db0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -33,6 +33,7 @@ struct sched_param { #include <linux/smp.h> #include <linux/sem.h> +#include <linux/shm.h> #include <linux/signal.h> #include <linux/compiler.h> #include <linux/completion.h> @@ -1386,6 +1387,7 @@ struct task_struct { #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ @@ -1629,12 +1631,6 @@ struct task_struct { unsigned long trace_recursion; #endif /* CONFIG_TRACING */ #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; struct memcg_oom_info { struct mem_cgroup *memcg; @@ -2955,15 +2951,10 @@ static inline void inc_syscw(struct task_struct *tsk) #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); -extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - -static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ -} #endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, diff --git a/include/linux/shm.h b/include/linux/shm.h index 57d77709fbe2..6fb801686ad6 100644 --- a/include/linux/shm.h +++ b/include/linux/shm.h @@ -1,6 +1,7 @@ #ifndef _LINUX_SHM_H_ #define _LINUX_SHM_H_ +#include <linux/list.h> #include <asm/page.h> #include <uapi/linux/shm.h> #include <asm/shmparam.h> @@ -20,6 +21,7 @@ struct shmid_kernel /* private to the kernel */ /* The task created the shm object. NULL if the task is dead. */ struct task_struct *shm_creator; + struct list_head shm_clist; /* list by creator */ }; /* shm_mode upper byte flags */ @@ -44,11 +46,20 @@ struct shmid_kernel /* private to the kernel */ #define SHM_HUGE_1GB (30 << SHM_HUGE_SHIFT) #ifdef CONFIG_SYSVIPC +struct sysv_shm { + struct list_head shm_clist; +}; + long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba); -extern int is_file_shm_hugepages(struct file *file); -extern void exit_shm(struct task_struct *task); +int is_file_shm_hugepages(struct file *file); +void exit_shm(struct task_struct *task); +#define shm_init_task(task) INIT_LIST_HEAD(&(task)->sysvshm.shm_clist) #else +struct sysv_shm { + /* empty */ +}; + static inline long do_shmat(int shmid, char __user *shmaddr, int shmflg, unsigned long *addr, unsigned long shmlba) @@ -62,6 +73,9 @@ static inline int is_file_shm_hugepages(struct file *file) static inline void exit_shm(struct task_struct *task) { } +static inline void shm_init_task(struct task_struct *task) +{ +} #endif #endif /* _LINUX_SHM_H_ */ diff --git a/include/linux/string.h b/include/linux/string.h index d36977e029af..2663afcc0861 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -117,6 +117,7 @@ void *memchr_inv(const void *s, int c, size_t n); extern char *kstrdup(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); +extern char *kstrimdup(const char *s, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); diff --git a/include/linux/swap.h b/include/linux/swap.h index 4bdbee80eede..1b72060f093a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -311,7 +311,6 @@ extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); -extern void init_page_accessed(struct page *page); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); @@ -321,6 +320,9 @@ extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); +extern void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma); + /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); @@ -379,9 +381,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) } #endif #ifdef CONFIG_MEMCG_SWAP -extern void mem_cgroup_uncharge_swap(swp_entry_t ent); +extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry); +extern void mem_cgroup_uncharge_swap(swp_entry_t entry); #else -static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) +static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ +} +static inline void mem_cgroup_uncharge_swap(swp_entry_t entry) { } #endif @@ -441,7 +447,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); -extern void swapcache_free(swp_entry_t, struct page *page); +extern void swapcache_free(swp_entry_t); extern int free_swap_and_cache(swp_entry_t); extern int swap_type_of(dev_t, sector_t, struct block_device **); extern unsigned int count_swap_pages(int, int); @@ -505,7 +511,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void swapcache_free(swp_entry_t swp, struct page *page) +static inline void swapcache_free(swp_entry_t swp) { } diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index cd82f72fc908..f7e4b5241f82 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -317,6 +317,10 @@ asmlinkage long sys_restart_syscall(void); asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, struct kexec_segment __user *segments, unsigned long flags); +asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd, + unsigned long cmdline_len, + const char __user *cmdline_ptr, + unsigned long flags); asmlinkage long sys_exit(int error_code); asmlinkage long sys_exit_group(int error_code); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 14a8ff2de11e..b7361f831226 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -34,8 +34,6 @@ struct ctl_table_root; struct ctl_table_header; struct ctl_dir; -typedef struct ctl_table ctl_table; - typedef int proc_handler (struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 4836ba3c1cd8..e95372654f09 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -57,9 +57,9 @@ static inline void put_user_ns(struct user_namespace *ns) } struct seq_operations; -extern struct seq_operations proc_uid_seq_operations; -extern struct seq_operations proc_gid_seq_operations; -extern struct seq_operations proc_projid_seq_operations; +extern const struct seq_operations proc_uid_seq_operations; +extern const struct seq_operations proc_gid_seq_operations; +extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4b8a89189a29..b87696fdf06a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -113,7 +113,7 @@ extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page ***pages); + struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); diff --git a/include/linux/zbud.h b/include/linux/zbud.h index 13af0d450bf6..f9d41a6e361f 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h @@ -11,7 +11,7 @@ struct zbud_ops { struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); void zbud_destroy_pool(struct zbud_pool *pool); -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle); void zbud_free(struct zbud_pool *pool, unsigned long handle); int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); diff --git a/include/linux/zlib.h b/include/linux/zlib.h index 9c5a6b4de0a3..92dbbd3f6c75 100644 --- a/include/linux/zlib.h +++ b/include/linux/zlib.h @@ -83,11 +83,11 @@ struct internal_state; typedef struct z_stream_s { const Byte *next_in; /* next input byte */ - uInt avail_in; /* number of bytes available at next_in */ + uLong avail_in; /* number of bytes available at next_in */ uLong total_in; /* total nb of input bytes read so far */ Byte *next_out; /* next output byte should be put there */ - uInt avail_out; /* remaining free space at next_out */ + uLong avail_out; /* remaining free space at next_out */ uLong total_out; /* total nb of bytes output so far */ char *msg; /* last error message, NULL if no error */ @@ -493,64 +493,6 @@ extern int deflateInit2 (z_streamp strm, method). msg is set to null if there is no error message. deflateInit2 does not perform any compression: this will be done by deflate(). */ - -#if 0 -extern int zlib_deflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -#endif -/* - Initializes the compression dictionary from the given byte sequence - without producing any compressed output. This function must be called - immediately after deflateInit, deflateInit2 or deflateReset, before any - call of deflate. The compressor and decompressor must use exactly the same - dictionary (see inflateSetDictionary). - - The dictionary should consist of strings (byte sequences) that are likely - to be encountered later in the data to be compressed, with the most commonly - used strings preferably put towards the end of the dictionary. Using a - dictionary is most useful when the data to be compressed is short and can be - predicted with good accuracy; the data can then be compressed better than - with the default empty dictionary. - - Depending on the size of the compression data structures selected by - deflateInit or deflateInit2, a part of the dictionary may in effect be - discarded, for example if the dictionary is larger than the window size in - deflate or deflate2. Thus the strings most likely to be useful should be - put at the end of the dictionary, not at the front. - - Upon return of this function, strm->adler is set to the Adler32 value - of the dictionary; the decompressor may later use this value to determine - which dictionary has been used by the compressor. (The Adler32 value - applies to the whole dictionary even if only a subset of the dictionary is - actually used by the compressor.) - - deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent (for example if deflate has already been called for this stream - or if the compression method is bsort). deflateSetDictionary does not - perform any compression: this will be done by deflate(). -*/ - -#if 0 -extern int zlib_deflateCopy (z_streamp dest, z_streamp source); -#endif - -/* - Sets the destination stream as a complete copy of the source stream. - - This function can be useful when several compression strategies will be - tried, for example when there are several ways of pre-processing the input - data with a filter. The streams that will be discarded should then be freed - by calling deflateEnd. Note that deflateCopy duplicates the internal - compression state which can be quite large, so this strategy is slow and - can consume lots of memory. - - deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not - enough memory, Z_STREAM_ERROR if the source stream state was inconsistent - (such as zalloc being NULL). msg is left unchanged in both source and - destination. -*/ extern int zlib_deflateReset (z_streamp strm); /* @@ -568,27 +510,6 @@ static inline unsigned long deflateBound(unsigned long s) return s + ((s + 7) >> 3) + ((s + 63) >> 6) + 11; } -#if 0 -extern int zlib_deflateParams (z_streamp strm, int level, int strategy); -#endif -/* - Dynamically update the compression level and compression strategy. The - interpretation of level and strategy is as in deflateInit2. This can be - used to switch between compression and straight copy of the input data, or - to switch to a different kind of input data requiring a different - strategy. If the compression level is changed, the input available so far - is compressed with the old level (and may be flushed); the new level will - take effect only at the next call of deflate(). - - Before the call of deflateParams, the stream state must be set as for - a call of deflate(), since the currently available input may have to - be compressed and flushed. In particular, strm->avail_out must be non-zero. - - deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source - stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR - if strm->avail_out was zero. -*/ - /* extern int inflateInit2 (z_streamp strm, int windowBits); @@ -631,45 +552,6 @@ extern int inflateInit2 (z_streamp strm, int windowBits); and avail_out are unchanged.) */ -extern int zlib_inflateSetDictionary (z_streamp strm, - const Byte *dictionary, - uInt dictLength); -/* - Initializes the decompression dictionary from the given uncompressed byte - sequence. This function must be called immediately after a call of inflate, - if that call returned Z_NEED_DICT. The dictionary chosen by the compressor - can be determined from the adler32 value returned by that call of inflate. - The compressor and decompressor must use exactly the same dictionary (see - deflateSetDictionary). For raw inflate, this function can be called - immediately after inflateInit2() or inflateReset() and before any call of - inflate() to set the dictionary. The application must insure that the - dictionary that was used for compression is provided. - - inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a - parameter is invalid (such as NULL dictionary) or the stream state is - inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the - expected one (incorrect adler32 value). inflateSetDictionary does not - perform any decompression: this will be done by subsequent calls of - inflate(). -*/ - -#if 0 -extern int zlib_inflateSync (z_streamp strm); -#endif -/* - Skips invalid compressed data until a full flush point (see above the - description of deflate with Z_FULL_FLUSH) can be found, or until all - available input is skipped. No output is provided. - - inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR - if no more input was provided, Z_DATA_ERROR if no flush point has been found, - or Z_STREAM_ERROR if the stream structure was inconsistent. In the success - case, the application may save the current current value of total_in which - indicates where valid compressed data was found. In the error case, the - application may repeatedly call inflateSync, providing more input each time, - until success or end of the input data. -*/ - extern int zlib_inflateReset (z_streamp strm); /* This function is equivalent to inflateEnd followed by inflateInit, diff --git a/include/linux/zpool.h b/include/linux/zpool.h new file mode 100644 index 000000000000..f14bd75f08b3 --- /dev/null +++ b/include/linux/zpool.h @@ -0,0 +1,106 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for the zbud and zsmalloc memory + * storage pool implementations. Typically, this is used to + * store compressed memory. + */ + +#ifndef _ZPOOL_H_ +#define _ZPOOL_H_ + +struct zpool; + +struct zpool_ops { + int (*evict)(struct zpool *pool, unsigned long handle); +}; + +/* + * Control how a handle is mapped. It will be ignored if the + * implementation does not support it. Its use is optional. + * Note that this does not refer to memory protection, it + * refers to how the memory will be copied in/out if copying + * is necessary during mapping; read-write is the safest as + * it copies the existing memory in on map, and copies the + * changed memory back out on unmap. Write-only does not copy + * in the memory and should only be used for initialization. + * If in doubt, use ZPOOL_MM_DEFAULT which is read-write. + */ +enum zpool_mapmode { + ZPOOL_MM_RW, /* normal read-write mapping */ + ZPOOL_MM_RO, /* read-only (no copy-out at unmap time) */ + ZPOOL_MM_WO, /* write-only (no copy-in at map time) */ + + ZPOOL_MM_DEFAULT = ZPOOL_MM_RW +}; + +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops); + +char *zpool_get_type(struct zpool *pool); + +void zpool_destroy_pool(struct zpool *pool); + +int zpool_malloc(struct zpool *pool, size_t size, gfp_t gfp, + unsigned long *handle); + +void zpool_free(struct zpool *pool, unsigned long handle); + +int zpool_shrink(struct zpool *pool, unsigned int pages, + unsigned int *reclaimed); + +void *zpool_map_handle(struct zpool *pool, unsigned long handle, + enum zpool_mapmode mm); + +void zpool_unmap_handle(struct zpool *pool, unsigned long handle); + +u64 zpool_get_total_size(struct zpool *pool); + + +/** + * struct zpool_driver - driver implementation for zpool + * @type: name of the driver. + * @list: entry in the list of zpool drivers. + * @create: create a new pool. + * @destroy: destroy a pool. + * @malloc: allocate mem from a pool. + * @free: free mem from a pool. + * @shrink: shrink the pool. + * @map: map a handle. + * @unmap: unmap a handle. + * @total_size: get total size of a pool. + * + * This is created by a zpool implementation and registered + * with zpool. + */ +struct zpool_driver { + char *type; + struct module *owner; + atomic_t refcount; + struct list_head list; + + void *(*create)(gfp_t gfp, struct zpool_ops *ops); + void (*destroy)(void *pool); + + int (*malloc)(void *pool, size_t size, gfp_t gfp, + unsigned long *handle); + void (*free)(void *pool, unsigned long handle); + + int (*shrink)(void *pool, unsigned int pages, + unsigned int *reclaimed); + + void *(*map)(void *pool, unsigned long handle, + enum zpool_mapmode mm); + void (*unmap)(void *pool, unsigned long handle); + + u64 (*total_size)(void *pool); +}; + +void zpool_register_driver(struct zpool_driver *driver); + +int zpool_unregister_driver(struct zpool_driver *driver); + +int zpool_evict(void *pool, unsigned long handle); + +#endif diff --git a/include/scsi/scsi.h b/include/scsi/scsi.h index 91e2e4215ba0..4b69139d9839 100644 --- a/include/scsi/scsi.h +++ b/include/scsi/scsi.h @@ -31,7 +31,7 @@ enum scsi_timeouts { * Like SCSI_MAX_SG_SEGMENTS, but for archs that have sg chaining. This limit * is totally arbitrary, a setting of 2048 will get you at least 8mb ios. */ -#ifdef ARCH_HAS_SG_CHAIN +#ifdef CONFIG_ARCH_HAS_SG_CHAIN #define SCSI_MAX_SG_CHAIN_SEGMENTS 2048 #else #define SCSI_MAX_SG_CHAIN_SEGMENTS SCSI_MAX_SG_SEGMENTS diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h index 4e4f2f8b1ac2..dd2b5467d905 100644 --- a/include/trace/events/migrate.h +++ b/include/trace/events/migrate.h @@ -17,6 +17,7 @@ {MR_MEMORY_HOTPLUG, "memory_hotplug"}, \ {MR_SYSCALL, "syscall_or_cpuset"}, \ {MR_MEMPOLICY_MBIND, "mempolicy_mbind"}, \ + {MR_NUMA_MISPLACED, "numa_misplaced"}, \ {MR_CMA, "cma"} TRACE_EVENT(mm_migrate_pages, diff --git a/include/trace/events/pagemap.h b/include/trace/events/pagemap.h index 1c9fabde69e4..ce0803b8d05f 100644 --- a/include/trace/events/pagemap.h +++ b/include/trace/events/pagemap.h @@ -28,12 +28,10 @@ TRACE_EVENT(mm_lru_insertion, TP_PROTO( struct page *page, - unsigned long pfn, - int lru, - unsigned long flags + int lru ), - TP_ARGS(page, pfn, lru, flags), + TP_ARGS(page, lru), TP_STRUCT__entry( __field(struct page *, page ) @@ -44,9 +42,9 @@ TRACE_EVENT(mm_lru_insertion, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); __entry->lru = lru; - __entry->flags = flags; + __entry->flags = trace_pagemap_flags(page); ), /* Flag format is based on page-types.c formatting for pagemap */ @@ -64,9 +62,9 @@ TRACE_EVENT(mm_lru_insertion, TRACE_EVENT(mm_lru_activate, - TP_PROTO(struct page *page, unsigned long pfn), + TP_PROTO(struct page *page), - TP_ARGS(page, pfn), + TP_ARGS(page), TP_STRUCT__entry( __field(struct page *, page ) @@ -75,7 +73,7 @@ TRACE_EVENT(mm_lru_activate, TP_fast_assign( __entry->page = page; - __entry->pfn = pfn; + __entry->pfn = page_to_pfn(page); ), /* Flag format is based on page-types.c formatting for pagemap */ diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h index d6629d49a243..6925f5b42f89 100644 --- a/include/uapi/linux/kexec.h +++ b/include/uapi/linux/kexec.h @@ -13,6 +13,17 @@ #define KEXEC_PRESERVE_CONTEXT 0x00000002 #define KEXEC_ARCH_MASK 0xffff0000 +/* + * Kexec file load interface flags. + * KEXEC_FILE_UNLOAD : Unload already loaded kexec/kdump image. + * KEXEC_FILE_ON_CRASH : Load/unload operation belongs to kdump image. + * KEXEC_FILE_NO_INITRAMFS : No initramfs is being loaded. Ignore the initrd + * fd field. + */ +#define KEXEC_FILE_UNLOAD 0x00000001 +#define KEXEC_FILE_ON_CRASH 0x00000002 +#define KEXEC_FILE_NO_INITRAMFS 0x00000004 + /* These values match the ELF architecture values. * Unless there is a good reason that should continue to be the case. */ diff --git a/init/Kconfig b/init/Kconfig index 85fb9851b326..1dfdd81ff9bd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -783,8 +783,13 @@ endchoice endmenu # "RCU Subsystem" +config BUILD_BIN2C + bool + default n + config IKCONFIG tristate "Kernel .config support" + select BUILD_BIN2C ---help--- This option enables the complete Linux kernel ".config" file contents to be saved in the kernel. It provides documentation @@ -807,15 +812,53 @@ config LOG_BUF_SHIFT range 12 21 default 17 help - Select kernel log buffer size as a power of 2. + Select the minimal kernel log buffer size as a power of 2. + The final size is affected by LOG_CPU_MAX_BUF_SHIFT config + parameter, see below. Any higher size also might be forced + by "log_buf_len" boot parameter. + Examples: - 17 => 128 KB + 17 => 128 KB 16 => 64 KB - 15 => 32 KB - 14 => 16 KB + 15 => 32 KB + 14 => 16 KB 13 => 8 KB 12 => 4 KB +config LOG_CPU_MAX_BUF_SHIFT + int "CPU kernel log buffer size contribution (13 => 8 KB, 17 => 128KB)" + range 0 21 + default 12 if !BASE_SMALL + default 0 if BASE_SMALL + help + This option allows to increase the default ring buffer size + according to the number of CPUs. The value defines the contribution + of each CPU as a power of 2. The used space is typically only few + lines however it might be much more when problems are reported, + e.g. backtraces. + + The increased size means that a new buffer has to be allocated and + the original static one is unused. It makes sense only on systems + with more CPUs. Therefore this value is used only when the sum of + contributions is greater than the half of the default kernel ring + buffer as defined by LOG_BUF_SHIFT. The default values are set + so that more than 64 CPUs are needed to trigger the allocation. + + Also this option is ignored when "log_buf_len" kernel parameter is + used as it forces an exact (power of two) size of the ring buffer. + + The number of possible CPUs is used for this computation ignoring + hotplugging making the compuation optimal for the the worst case + scenerio while allowing a simple algorithm to be used from bootup. + + Examples shift values and their meaning: + 17 => 128 KB for each CPU + 16 => 64 KB for each CPU + 15 => 32 KB for each CPU + 14 => 16 KB for each CPU + 13 => 8 KB for each CPU + 12 => 4 KB for each CPU + # # Architectures with an unreliable sched_clock() should select this: # diff --git a/init/do_mounts.c b/init/do_mounts.c index 82f22885c87e..b6237c31b0e2 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -539,12 +539,6 @@ void __init prepare_namespace(void) { int is_floppy; - if (root_delay) { - printk(KERN_INFO "Waiting %d sec before mounting root device...\n", - root_delay); - ssleep(root_delay); - } - /* * wait for the known devices to complete their probing * @@ -571,6 +565,12 @@ void __init prepare_namespace(void) if (initrd_load()) goto out; + if (root_delay) { + pr_info("Waiting %d sec before mounting root device...\n", + root_delay); + ssleep(root_delay); + } + /* wait for any asynchronous scanning to complete */ if ((ROOT_DEV == 0) && root_wait) { printk(KERN_INFO "Waiting for root device %s...\n", diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index a8227022e3a0..e5d059e8aa11 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -311,9 +311,9 @@ static int exit_code; static int decompress_error; static int crd_infd, crd_outfd; -static int __init compr_fill(void *buf, unsigned int len) +static long __init compr_fill(void *buf, unsigned long len) { - int r = sys_read(crd_infd, buf, len); + long r = sys_read(crd_infd, buf, len); if (r < 0) printk(KERN_ERR "RAMDISK: error while reading compressed data"); else if (r == 0) @@ -321,13 +321,13 @@ static int __init compr_fill(void *buf, unsigned int len) return r; } -static int __init compr_flush(void *window, unsigned int outcnt) +static long __init compr_flush(void *window, unsigned long outcnt) { - int written = sys_write(crd_outfd, window, outcnt); + long written = sys_write(crd_outfd, window, outcnt); if (written != outcnt) { if (decompress_error == 0) printk(KERN_ERR - "RAMDISK: incomplete write (%d != %d)\n", + "RAMDISK: incomplete write (%ld != %ld)\n", written, outcnt); decompress_error = 1; return -1; diff --git a/init/initramfs.c b/init/initramfs.c index a8497fab1c3d..a7566031242e 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,29 @@ #include <linux/syscalls.h> #include <linux/utime.h> +static ssize_t __init xwrite(int fd, const char *p, size_t count) +{ + ssize_t out = 0; + + /* sys_write only can write MAX_RW_COUNT aka 2G-4K bytes at most */ + while (count) { + ssize_t rv = sys_write(fd, p, count); + + if (rv < 0) { + if (rv == -EINTR || rv == -EAGAIN) + continue; + return out ? out : rv; + } else if (rv == 0) + break; + + p += rv; + out += rv; + count -= rv; + } + + return out; +} + static __initdata char *message; static void __init error(char *x) { @@ -174,7 +197,7 @@ static __initdata enum state { } state, next_state; static __initdata char *victim; -static __initdata unsigned count; +static unsigned long count __initdata; static __initdata loff_t this_header, next_header; static inline void __init eat(unsigned n) @@ -186,7 +209,7 @@ static inline void __init eat(unsigned n) static __initdata char *vcollected; static __initdata char *collected; -static __initdata int remains; +static long remains __initdata; static __initdata char *collect; static void __init read_into(char *buf, unsigned size, enum state next) @@ -213,7 +236,7 @@ static int __init do_start(void) static int __init do_collect(void) { - unsigned n = remains; + unsigned long n = remains; if (count < n) n = count; memcpy(collect, victim, n); @@ -346,7 +369,7 @@ static int __init do_name(void) static int __init do_copy(void) { if (count >= body_len) { - sys_write(wfd, victim, body_len); + xwrite(wfd, victim, body_len); sys_close(wfd); do_utime(vcollected, mtime); kfree(vcollected); @@ -354,7 +377,7 @@ static int __init do_copy(void) state = SkipIt; return 0; } else { - sys_write(wfd, victim, count); + xwrite(wfd, victim, count); body_len -= count; eat(count); return 1; @@ -384,7 +407,7 @@ static __initdata int (*actions[])(void) = { [Reset] = do_reset, }; -static int __init write_buffer(char *buf, unsigned len) +static long __init write_buffer(char *buf, unsigned long len) { count = len; victim = buf; @@ -394,11 +417,11 @@ static int __init write_buffer(char *buf, unsigned len) return len - count; } -static int __init flush_buffer(void *bufv, unsigned len) +static long __init flush_buffer(void *bufv, unsigned long len) { char *buf = (char *) bufv; - int written; - int origLen = len; + long written; + long origLen = len; if (message) return -1; while ((written = write_buffer(buf, len)) < len && !message) { @@ -417,13 +440,13 @@ static int __init flush_buffer(void *bufv, unsigned len) return origLen; } -static unsigned my_inptr; /* index of next byte to be processed in inbuf */ +static unsigned long my_inptr; /* index of next byte to be processed in inbuf */ #include <linux/decompress/generic.h> -static char * __init unpack_to_rootfs(char *buf, unsigned len) +static char * __init unpack_to_rootfs(char *buf, unsigned long len) { - int written, res; + long written; decompress_fn decompress; const char *compress_name; static __initdata char msg_buf[64]; @@ -457,7 +480,7 @@ static char * __init unpack_to_rootfs(char *buf, unsigned len) decompress = decompress_method(buf, len, &compress_name); pr_debug("Detected %s compressed data\n", compress_name); if (decompress) { - res = decompress(buf, len, NULL, flush_buffer, NULL, + int res = decompress(buf, len, NULL, flush_buffer, NULL, &my_inptr, error); if (res) error("decompressor failed"); @@ -603,8 +626,13 @@ static int __init populate_rootfs(void) fd = sys_open("/initrd.image", O_WRONLY|O_CREAT, 0700); if (fd >= 0) { - sys_write(fd, (char *)initrd_start, - initrd_end - initrd_start); + ssize_t written = xwrite(fd, (char *)initrd_start, + initrd_end - initrd_start); + + if (written != initrd_end - initrd_start) + pr_err("/initrd.image: incomplete write (%zd != %ld)\n", + written, initrd_end - initrd_start); + sys_close(fd); free_initrd(); } diff --git a/ipc/shm.c b/ipc/shm.c index 89fc354156cb..7fc9f9f3a26b 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -178,6 +178,7 @@ static void shm_rcu_free(struct rcu_head *head) static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) { + list_del(&s->shm_clist); ipc_rmid(&shm_ids(ns), &s->shm_perm); } @@ -268,37 +269,6 @@ static void shm_close(struct vm_area_struct *vma) } /* Called with ns->shm_ids(ns).rwsem locked */ -static int shm_try_destroy_current(int id, void *p, void *data) -{ - struct ipc_namespace *ns = data; - struct kern_ipc_perm *ipcp = p; - struct shmid_kernel *shp = container_of(ipcp, struct shmid_kernel, shm_perm); - - if (shp->shm_creator != current) - return 0; - - /* - * Mark it as orphaned to destroy the segment when - * kernel.shm_rmid_forced is changed. - * It is noop if the following shm_may_destroy() returns true. - */ - shp->shm_creator = NULL; - - /* - * Don't even try to destroy it. If shm_rmid_forced=0 and IPC_RMID - * is not set, it shouldn't be deleted here. - */ - if (!ns->shm_rmid_forced) - return 0; - - if (shm_may_destroy(ns, shp)) { - shm_lock_by_ptr(shp); - shm_destroy(ns, shp); - } - return 0; -} - -/* Called with ns->shm_ids(ns).rwsem locked */ static int shm_try_destroy_orphaned(int id, void *p, void *data) { struct ipc_namespace *ns = data; @@ -329,18 +299,50 @@ void shm_destroy_orphaned(struct ipc_namespace *ns) up_write(&shm_ids(ns).rwsem); } - +/* Locking assumes this will only be called with task == current */ void exit_shm(struct task_struct *task) { struct ipc_namespace *ns = task->nsproxy->ipc_ns; + struct shmid_kernel *shp, *n; - if (shm_ids(ns).in_use == 0) + if (list_empty(&task->sysvshm.shm_clist)) return; - /* Destroy all already created segments, but not mapped yet */ + /* + * If kernel.shm_rmid_forced is not set then only keep track of + * which shmids are orphaned, so that a later set of the sysctl + * can clean them up. + */ + if (!ns->shm_rmid_forced) { + down_read(&shm_ids(ns).rwsem); + list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) + shp->shm_creator = NULL; + /* + * Only under read lock but we are only called on current + * so no entry on the list will be shared. + */ + list_del(&task->sysvshm.shm_clist); + up_read(&shm_ids(ns).rwsem); + return; + } + + /* + * Destroy all already created segments, that were not yet mapped, + * and mark any mapped as orphan to cover the sysctl toggling. + * Destroy is skipped if shm_may_destroy() returns false. + */ down_write(&shm_ids(ns).rwsem); - if (shm_ids(ns).in_use) - idr_for_each(&shm_ids(ns).ipcs_idr, &shm_try_destroy_current, ns); + list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { + shp->shm_creator = NULL; + + if (shm_may_destroy(ns, shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } + } + + /* Remove the list head from any segments still attached. */ + list_del(&task->sysvshm.shm_clist); up_write(&shm_ids(ns).rwsem); } @@ -561,6 +563,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) shp->shm_nattch = 0; shp->shm_file = file; shp->shm_creator = current; + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); /* * shmid gets reported as "inode#" in /proc/pid/maps. diff --git a/kernel/Makefile b/kernel/Makefile index 973a40cf8068..de62ac0bb571 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -104,7 +104,7 @@ targets += config_data.gz $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE $(call if_changed,gzip) - filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") + filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;") targets += config_data.h $(obj)/config_data.h: $(obj)/config_data.gz FORCE $(call filechk,ikconfiggz) diff --git a/kernel/acct.c b/kernel/acct.c index 3cec8c4cee14..98c4a209bef1 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -142,12 +142,12 @@ static int check_free_space(struct bsd_acct_struct *acct, struct file *file) if (acct->active) { if (act < 0) { acct->active = 0; - printk(KERN_INFO "Process accounting paused\n"); + pr_info("Process accounting paused\n"); } } else { if (act > 0) { acct->active = 1; - printk(KERN_INFO "Process accounting resumed\n"); + pr_info("Process accounting resumed\n"); } } @@ -262,6 +262,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name) if (name) { struct filename *tmp = getname(name); + if (IS_ERR(tmp)) return PTR_ERR(tmp); error = acct_on(tmp); @@ -391,7 +392,7 @@ static comp_t encode_comp_t(unsigned long value) return exp; } -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* * encode an u64 into a comp2_t (24 bits) * @@ -404,7 +405,7 @@ static comp_t encode_comp_t(unsigned long value) #define MANTSIZE2 20 /* 20 bit mantissa. */ #define EXPSIZE2 5 /* 5 bit base 2 exponent. */ #define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ -#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ +#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */ static comp2_t encode_comp2_t(u64 value) { @@ -435,7 +436,7 @@ static comp2_t encode_comp2_t(u64 value) } #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -444,8 +445,9 @@ static u32 encode_float(u64 value) unsigned exp = 190; unsigned u; - if (value==0) return 0; - while ((s64)value > 0){ + if (value == 0) + return 0; + while ((s64)value > 0) { value <<= 1; exp--; } @@ -505,16 +507,17 @@ static void do_acct_process(struct bsd_acct_struct *acct, + current->group_leader->start_time.tv_nsec; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_etime = encode_float(elapsed); #else ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? - (unsigned long) elapsed : (unsigned long) -1l); + (unsigned long) elapsed : (unsigned long) -1l); #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 { /* new enlarged etime field */ comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; ac.ac_etime_lo = (u16) etime; } @@ -524,15 +527,15 @@ static void do_acct_process(struct bsd_acct_struct *acct, /* we really need to bite the bullet and change layout */ ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid); ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid); -#if ACCT_VERSION==2 +#if ACCT_VERSION == 2 ac.ac_ahz = AHZ; #endif -#if ACCT_VERSION==1 || ACCT_VERSION==2 +#if ACCT_VERSION == 1 || ACCT_VERSION == 2 /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; #endif -#if ACCT_VERSION==3 +#if ACCT_VERSION == 3 ac.ac_pid = task_tgid_nr_ns(current, ns); rcu_read_lock(); ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); @@ -593,6 +596,7 @@ void acct_collect(long exitcode, int group_dead) if (group_dead && current->mm) { struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); vma = current->mm->mmap; while (vma) { diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 8e9bc9c3dbb7..c447cd9848d1 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -106,7 +106,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count) if (unlikely(!entry)) return NULL; - fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL); if (unlikely(!fields)) { kfree(entry); return NULL; @@ -160,7 +160,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES]; int __init audit_register_class(int class, unsigned *list) { - __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL); if (!p) return -ENOMEM; while (*list != ~0U) { diff --git a/kernel/bounds.c b/kernel/bounds.c index 9fd4246b04b8..e1d1d1952bfa 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <linux/mmzone.h> #include <linux/kbuild.h> -#include <linux/page_cgroup.h> #include <linux/log2.h> #include <linux/spinlock_types.h> @@ -18,7 +17,6 @@ void foo(void) /* The enum constants to put into include/generated/bounds.h */ DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); - DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 6f3254e8c137..1d0af8a2c646 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* For mmu_notifiers */ const unsigned long mmun_start = addr; const unsigned long mmun_end = addr + PAGE_SIZE; + struct mem_cgroup *memcg; + + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + if (err) + return err; /* For try_to_free_swap() and munlock_vma_page() below */ lock_page(page); @@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr); + mem_cgroup_commit_charge(kpage, memcg, false); + lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { dec_mm_counter(mm, MM_FILEPAGES); @@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: + mem_cgroup_cancel_charge(kpage, memcg); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -315,18 +323,11 @@ retry: if (!new_page) goto put_old; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) - goto put_new; - __SetPageUptodate(new_page); copy_highpage(new_page, old_page); copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - if (ret) - mem_cgroup_uncharge_page(new_page); - -put_new: page_cache_release(new_page); put_old: put_page(old_page); diff --git a/kernel/exit.c b/kernel/exit.c index e5c4668f1799..20a9c1003a1d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,7 +59,7 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -151,7 +151,7 @@ static void __exit_signal(struct task_struct *tsk) spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +192,8 @@ repeat: */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -241,7 +242,8 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +296,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -405,7 +407,7 @@ assign_new_owner: * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -425,6 +427,7 @@ static void exit_mm(struct task_struct * tsk) core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -565,6 +568,7 @@ static void forget_original_parent(struct task_struct *father) list_for_each_entry_safe(p, n, &father->children, sibling) { struct task_struct *t = p; + do { t->real_parent = reaper; if (t->parent == father) { @@ -598,7 +602,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead) /* * This does two things: * - * A. Make init inherit all the child processes + * A. Make init inherit all the child processes * B. Check to see if any process groups have become orphaned * as a result of our exiting, and if they have any stopped * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) @@ -648,9 +652,8 @@ static void check_stack_usage(void) spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -691,8 +694,7 @@ void do_exit(long code) * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -716,9 +718,9 @@ void do_exit(long code) raw_spin_unlock_wait(&tsk->pi_lock); if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ @@ -836,7 +838,6 @@ void do_exit(long code) for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -846,7 +847,6 @@ void complete_and_exit(struct completion *comp, long code) do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -869,6 +869,7 @@ do_group_exit(int exit_code) exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -1033,9 +1034,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) * as other threads in the parent group can be right * here reaping other children at the same time. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); spin_lock_irq(&p->real_parent->sighand->siglock); @@ -1417,6 +1418,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1430,6 +1432,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk) list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 8d1063ad74df..197c42d44bef 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -365,12 +365,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) */ down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - mm->locked_vm = 0; - mm->mmap = NULL; - mm->vmacache_seqnum = 0; - mm->map_count = 0; - cpumask_clear(mm_cpumask(mm)); - mm->mm_rb = RB_ROOT; + mm->total_vm = oldmm->total_vm; + mm->shared_vm = oldmm->shared_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + rb_link = &mm->mm_rb.rb_node; rb_parent = NULL; pprev = &mm->mmap; @@ -527,19 +526,37 @@ static void mm_init_aio(struct mm_struct *mm) #endif } +static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) +{ +#ifdef CONFIG_MEMCG + mm->owner = p; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) { + mm->mmap = NULL; + mm->mm_rb = RB_ROOT; + mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); + mm->map_count = 0; + mm->locked_vm = 0; + mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + mmu_notifier_mm_init(mm); clear_tlb_flush_pending(mm); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + mm->pmd_huge_pte = NULL; +#endif if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; @@ -549,11 +566,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->def_flags = 0; } - if (likely(!mm_alloc_pgd(mm))) { - mmu_notifier_mm_init(mm); - return mm; - } + if (mm_alloc_pgd(mm)) + goto fail_nopgd; + + if (init_new_context(p, mm)) + goto fail_nocontext; + + return mm; +fail_nocontext: + mm_free_pgd(mm); +fail_nopgd: free_mm(mm); return NULL; } @@ -587,7 +610,6 @@ struct mm_struct *mm_alloc(void) return NULL; memset(mm, 0, sizeof(*mm)); - mm_init_cpumask(mm); return mm_init(mm, current); } @@ -819,17 +841,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); - mm_init_cpumask(mm); -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - mm->pmd_huge_pte = NULL; -#endif if (!mm_init(mm, tsk)) goto fail_nomem; - if (init_new_context(tsk, mm)) - goto fail_nocontext; - dup_mm_exe_file(oldmm, mm); err = dup_mmap(mm, oldmm); @@ -851,15 +866,6 @@ free_pt: fail_nomem: return NULL; - -fail_nocontext: - /* - * If init_new_context() failed, we cannot use mmput() to free the mm - * because it calls destroy_context() - */ - mm_free_pgd(mm); - free_mm(mm); - return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) @@ -1098,13 +1104,6 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MEMCG -void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ - mm->owner = p; -} -#endif /* CONFIG_MEMCG */ - /* * Initialize POSIX timer handling for a single task. */ @@ -1306,10 +1305,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; @@ -1327,6 +1322,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (retval) goto bad_fork_cleanup_policy; /* copy all the process information */ + shm_init_task(p); retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_audit; @@ -1872,6 +1868,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ exit_sem(current); } + if (unshare_flags & CLONE_NEWIPC) { + /* Orphan segments in old ns (see sem above). */ + exit_shm(current); + shm_init_task(current); + } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 15ff01a76379..edf67c493a8e 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -784,8 +784,7 @@ static __init int gcov_fs_init(void) err_remove: pr_err("init failed\n"); - if (root_node.dentry) - debugfs_remove(root_node.dentry); + debugfs_remove(root_node.dentry); return rc; } diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index cb0cf37dac3a..ae5167087845 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address, address += symbol_offset; name = kallsyms_lookup(address, &size, &offset, &modname, buffer); if (!name) - return sprintf(buffer, "0x%lx", address); + return sprintf(buffer, "0x%lx", address - symbol_offset); if (name != buffer) strcpy(buffer, name); diff --git a/kernel/kexec.c b/kernel/kexec.c index 369f41a94124..c913c97ea960 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) "kexec: " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -39,6 +41,9 @@ #include <asm/io.h> #include <asm/sections.h> +#include <crypto/hash.h> +#include <crypto/sha.h> + /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; @@ -51,6 +56,15 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; +/* + * Declare these symbols weak so that if architecture provides a purgatory, + * these will be overridden. + */ +char __weak kexec_purgatory[0]; +size_t __weak kexec_purgatory_size = 0; + +static int kexec_calculate_store_digests(struct kimage *image); + /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { .name = "Crash kernel", @@ -124,45 +138,27 @@ static struct page *kimage_alloc_page(struct kimage *image, gfp_t gfp_mask, unsigned long dest); -static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_user_segment_list(struct kimage *image, + unsigned long nr_segments, + struct kexec_segment __user *segments) { + int ret; size_t segment_bytes; - struct kimage *image; - unsigned long i; - int result; - - /* Allocate a controlling structure */ - result = -ENOMEM; - image = kzalloc(sizeof(*image), GFP_KERNEL); - if (!image) - goto out; - - image->head = 0; - image->entry = &image->head; - image->last_entry = &image->head; - image->control_page = ~0; /* By default this does not apply */ - image->start = entry; - image->type = KEXEC_TYPE_DEFAULT; - - /* Initialize the list of control pages */ - INIT_LIST_HEAD(&image->control_pages); - - /* Initialize the list of destination pages */ - INIT_LIST_HEAD(&image->dest_pages); - - /* Initialize the list of unusable pages */ - INIT_LIST_HEAD(&image->unuseable_pages); /* Read in the segments */ image->nr_segments = nr_segments; segment_bytes = nr_segments * sizeof(*segments); - result = copy_from_user(image->segment, segments, segment_bytes); - if (result) { - result = -EFAULT; - goto out; - } + ret = copy_from_user(image->segment, segments, segment_bytes); + if (ret) + ret = -EFAULT; + + return ret; +} + +static int sanity_check_segment_list(struct kimage *image) +{ + int result, i; + unsigned long nr_segments = image->nr_segments; /* * Verify we have good destination addresses. The caller is @@ -184,9 +180,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) - goto out; + return result; if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) - goto out; + return result; } /* Verify our destination addresses do not overlap. @@ -207,7 +203,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, pend = pstart + image->segment[j].memsz; /* Do the segments overlap ? */ if ((mend > pstart) && (mstart < pend)) - goto out; + return result; } } @@ -219,130 +215,386 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EINVAL; for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) - goto out; + return result; } - result = 0; -out: - if (result == 0) - *rimage = image; - else - kfree(image); + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. + */ - return result; + if (image->type == KEXEC_TYPE_CRASH) { + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || + (mend > crashk_res.end)) + return result; + } + } + return 0; +} + +static struct kimage *do_kimage_alloc_init(void) +{ + struct kimage *image; + + /* Allocate a controlling structure */ + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + return NULL; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unusable_pages); + + return image; } static void kimage_free_page_list(struct list_head *list); -static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments, + unsigned long flags) { - int result; + int ret; struct kimage *image; + bool kexec_on_panic = flags & KEXEC_ON_CRASH; + + if (kexec_on_panic) { + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) + return -EADDRNOTAVAIL; + } /* Allocate and initialize a controlling structure */ - image = NULL; - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) - goto out; + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->start = entry; + + ret = copy_user_segment_list(image, nr_segments, segments); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + + /* Enable the special crash kernel control page allocation policy. */ + if (kexec_on_panic) { + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be * counted as destination pages. */ - result = -ENOMEM; + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_image; } - image->swap_page = kimage_alloc_control_pages(image, 0); - if (!image->swap_page) { - pr_err("Could not allocate swap buffer\n"); - goto out_free; + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err("Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: kimage_free_page_list(&image->control_pages); +out_free_image: kfree(image); -out: - return result; + return ret; } -static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, - struct kexec_segment __user *segments) +static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) { - int result; - struct kimage *image; - unsigned long i; + struct fd f = fdget(fd); + int ret; + struct kstat stat; + loff_t pos; + ssize_t bytes = 0; - image = NULL; - /* Verify we have a valid entry point */ - if ((entry < crashk_res.start) || (entry > crashk_res.end)) { - result = -EADDRNOTAVAIL; + if (!f.file) + return -EBADF; + + ret = vfs_getattr(&f.file->f_path, &stat); + if (ret) + goto out; + + if (stat.size > INT_MAX) { + ret = -EFBIG; goto out; } - /* Allocate and initialize a controlling structure */ - result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + ret = -EINVAL; goto out; + } - /* Enable the special crash kernel control page - * allocation policy. - */ - image->control_page = crashk_res.start; - image->type = KEXEC_TYPE_CRASH; + *buf = vmalloc(stat.size); + if (!*buf) { + ret = -ENOMEM; + goto out; + } - /* - * Verify we have good destination addresses. Normally - * the caller is responsible for making certain we don't - * attempt to load the new image into invalid or reserved - * areas of RAM. But crash kernels are preloaded into a - * reserved area of ram. We must ensure the addresses - * are in the reserved area otherwise preloading the - * kernel could corrupt things. - */ - result = -EADDRNOTAVAIL; - for (i = 0; i < nr_segments; i++) { - unsigned long mstart, mend; + pos = 0; + while (pos < stat.size) { + bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, + stat.size - pos); + if (bytes < 0) { + vfree(*buf); + ret = bytes; + goto out; + } - mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz - 1; - /* Ensure we are within the crash kernel limits */ - if ((mstart < crashk_res.start) || (mend > crashk_res.end)) - goto out_free; + if (bytes == 0) + break; + pos += bytes; } + if (pos != stat.size) { + ret = -EBADF; + vfree(*buf); + goto out; + } + + *buf_len = pos; +out: + fdput(f); + return ret; +} + +/* Architectures can provide this probe function */ +int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, + unsigned long buf_len) +{ + return -ENOEXEC; +} + +void * __weak arch_kexec_kernel_image_load(struct kimage *image) +{ + return ERR_PTR(-ENOEXEC); +} + +void __weak arch_kimage_file_post_load_cleanup(struct kimage *image) +{ +} + +/* Apply relocations of type RELA */ +int __weak +arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("RELA relocation unsupported.\n"); + return -ENOEXEC; +} + +/* Apply relocations of type REL */ +int __weak +arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, + unsigned int relsec) +{ + pr_err("REL relocation unsupported.\n"); + return -ENOEXEC; +} + +/* + * Free up memory used by kernel, initrd, and comand line. This is temporary + * memory allocation which is not needed any more after these buffers have + * been loaded into separate segments and have been copied elsewhere. + */ +static void kimage_file_post_load_cleanup(struct kimage *image) +{ + struct purgatory_info *pi = &image->purgatory_info; + + vfree(image->kernel_buf); + image->kernel_buf = NULL; + + vfree(image->initrd_buf); + image->initrd_buf = NULL; + + kfree(image->cmdline_buf); + image->cmdline_buf = NULL; + + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; + + vfree(pi->sechdrs); + pi->sechdrs = NULL; + + /* See if architecture has anything to cleanup post load */ + arch_kimage_file_post_load_cleanup(image); + /* - * Find a location for the control code buffer, and add - * the vector of segments so that it's pages will also be - * counted as destination pages. + * Above call should have called into bootloader to free up + * any data stored in kimage->image_loader_data. It should + * be ok now to free it up. */ - result = -ENOMEM; + kfree(image->image_loader_data); + image->image_loader_data = NULL; +} + +/* + * In file mode list of segments is prepared by kernel. Copy relevant + * data from user space, do error checking, prepare segment list + */ +static int +kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, + const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned flags) +{ + int ret = 0; + void *ldata; + + ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, + &image->kernel_buf_len); + if (ret) + return ret; + + /* Call arch image probe handlers */ + ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, + image->kernel_buf_len); + + if (ret) + goto out; + + /* It is possible that there no initramfs is being loaded */ + if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { + ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, + &image->initrd_buf_len); + if (ret) + goto out; + } + + if (cmdline_len) { + image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL); + if (!image->cmdline_buf) { + ret = -ENOMEM; + goto out; + } + + ret = copy_from_user(image->cmdline_buf, cmdline_ptr, + cmdline_len); + if (ret) { + ret = -EFAULT; + goto out; + } + + image->cmdline_buf_len = cmdline_len; + + /* command line should be a string with last byte null */ + if (image->cmdline_buf[cmdline_len - 1] != '\0') { + ret = -EINVAL; + goto out; + } + } + + /* Call arch image load handlers */ + ldata = arch_kexec_kernel_image_load(image); + + if (IS_ERR(ldata)) { + ret = PTR_ERR(ldata); + goto out; + } + + image->image_loader_data = ldata; +out: + /* In case of error, free up all allocated memory in this function */ + if (ret) + kimage_file_post_load_cleanup(image); + return ret; +} + +static int +kimage_file_alloc_init(struct kimage **rimage, int kernel_fd, + int initrd_fd, const char __user *cmdline_ptr, + unsigned long cmdline_len, unsigned long flags) +{ + int ret; + struct kimage *image; + bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH; + + image = do_kimage_alloc_init(); + if (!image) + return -ENOMEM; + + image->file_mode = 1; + + if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + } + + ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd, + cmdline_ptr, cmdline_len, flags); + if (ret) + goto out_free_image; + + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_post_load_bufs; + + ret = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { pr_err("Could not allocate control_code_buffer\n"); - goto out_free; + goto out_free_post_load_bufs; + } + + if (!kexec_on_panic) { + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + pr_err(KERN_ERR "Could not allocate swap buffer\n"); + goto out_free_control_pages; + } } *rimage = image; return 0; - -out_free: +out_free_control_pages: + kimage_free_page_list(&image->control_pages); +out_free_post_load_bufs: + kimage_file_post_load_cleanup(image); +out_free_image: kfree(image); -out: - return result; + return ret; } static int kimage_is_destination_range(struct kimage *image, @@ -608,7 +860,7 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->dest_pages); /* Walk through and free any unusable pages I have cached */ - kimage_free_page_list(&image->unuseable_pages); + kimage_free_page_list(&image->unusable_pages); } static void kimage_terminate(struct kimage *image) @@ -662,6 +914,14 @@ static void kimage_free(struct kimage *image) /* Free the kexec control pages... */ kimage_free_page_list(&image->control_pages); + + /* + * Free up any temporary buffers allocated. This might hit if + * error occurred much later after buffer allocation. + */ + if (image->file_mode) + kimage_file_post_load_cleanup(image); + kfree(image); } @@ -731,7 +991,7 @@ static struct page *kimage_alloc_page(struct kimage *image, /* If the page cannot be used file it away */ if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { - list_add(&page->lru, &image->unuseable_pages); + list_add(&page->lru, &image->unusable_pages); continue; } addr = page_to_pfn(page) << PAGE_SHIFT; @@ -790,10 +1050,14 @@ static int kimage_load_normal_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -825,7 +1089,11 @@ static int kimage_load_normal_segment(struct kimage *image, PAGE_SIZE - (maddr & ~PAGE_MASK)); uchunk = min(ubytes, mchunk); - result = copy_from_user(ptr, buf, uchunk); + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { result = -EFAULT; @@ -833,7 +1101,10 @@ static int kimage_load_normal_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -850,10 +1121,14 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; size_t ubytes, mbytes; int result; - unsigned char __user *buf; + unsigned char __user *buf = NULL; + unsigned char *kbuf = NULL; result = 0; - buf = segment->buf; + if (image->file_mode) + kbuf = segment->kbuf; + else + buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; @@ -876,7 +1151,12 @@ static int kimage_load_crash_segment(struct kimage *image, /* Zero the trailing part of the page */ memset(ptr + uchunk, 0, mchunk - uchunk); } - result = copy_from_user(ptr, buf, uchunk); + + /* For file based kexec, source pages are in kernel memory */ + if (image->file_mode) + memcpy(ptr, kbuf, uchunk); + else + result = copy_from_user(ptr, buf, uchunk); kexec_flush_icache_page(page); kunmap(page); if (result) { @@ -885,7 +1165,10 @@ static int kimage_load_crash_segment(struct kimage *image, } ubytes -= uchunk; maddr += mchunk; - buf += mchunk; + if (image->file_mode) + kbuf += mchunk; + else + buf += mchunk; mbytes -= mchunk; } out: @@ -985,16 +1268,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, /* Loading another kernel to reboot into */ if ((flags & KEXEC_ON_CRASH) == 0) - result = kimage_normal_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, - nr_segments, segments); + result = kimage_alloc_init(&image, entry, nr_segments, + segments, flags); crash_map_reserved_pages(); } if (result) @@ -1076,6 +1359,82 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, } #endif +SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd, + unsigned long, cmdline_len, const char __user *, cmdline_ptr, + unsigned long, flags) +{ + int ret = 0, i; + struct kimage **dest_image, *image; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT) || kexec_load_disabled) + return -EPERM; + + /* Make sure we have a legal set of flags */ + if (flags != (flags & KEXEC_FILE_FLAGS)) + return -EINVAL; + + image = NULL; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_FILE_ON_CRASH) + dest_image = &kexec_crash_image; + + if (flags & KEXEC_FILE_UNLOAD) + goto exchange; + + /* + * In case of crash, new kernel gets loaded in reserved region. It is + * same memory where old crash kernel might be loaded. Free any + * current crash dump kernel before we corrupt it. + */ + if (flags & KEXEC_FILE_ON_CRASH) + kimage_free(xchg(&kexec_crash_image, NULL)); + + ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr, + cmdline_len, flags); + if (ret) + goto out; + + ret = machine_kexec_prepare(image); + if (ret) + goto out; + + ret = kexec_calculate_store_digests(image); + if (ret) + goto out; + + for (i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n", + i, ksegment->buf, ksegment->bufsz, ksegment->mem, + ksegment->memsz); + + ret = kimage_load_segment(image, &image->segment[i]); + if (ret) + goto out; + } + + kimage_terminate(image); + + /* + * Free up any temporary buffers allocated which are not needed + * after image has been loaded + */ + kimage_file_post_load_cleanup(image); +exchange: + image = xchg(dest_image, image); +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + return ret; +} + void crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load @@ -1628,6 +1987,683 @@ static int __init crash_save_vmcoreinfo_init(void) subsys_initcall(crash_save_vmcoreinfo_init); +static int __kexec_add_segment(struct kimage *image, char *buf, + unsigned long bufsz, unsigned long mem, + unsigned long memsz) +{ + struct kexec_segment *ksegment; + + ksegment = &image->segment[image->nr_segments]; + ksegment->kbuf = buf; + ksegment->bufsz = bufsz; + ksegment->mem = mem; + ksegment->memsz = memsz; + image->nr_segments++; + + return 0; +} + +static int locate_mem_hole_top_down(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_end = min(end, kbuf->buf_max); + temp_start = temp_end - kbuf->memsz; + + do { + /* align down start */ + temp_start = temp_start & (~(kbuf->buf_align - 1)); + + if (temp_start < start || temp_start < kbuf->buf_min) + return 0; + + temp_end = temp_start + kbuf->memsz - 1; + + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start - PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end, + struct kexec_buf *kbuf) +{ + struct kimage *image = kbuf->image; + unsigned long temp_start, temp_end; + + temp_start = max(start, kbuf->buf_min); + + do { + temp_start = ALIGN(temp_start, kbuf->buf_align); + temp_end = temp_start + kbuf->memsz - 1; + + if (temp_end > end || temp_end > kbuf->buf_max) + return 0; + /* + * Make sure this does not conflict with any of existing + * segments + */ + if (kimage_is_destination_range(image, temp_start, temp_end)) { + temp_start = temp_start + PAGE_SIZE; + continue; + } + + /* We found a suitable memory range */ + break; + } while (1); + + /* If we are here, we found a suitable memory range */ + __kexec_add_segment(image, kbuf->buffer, kbuf->bufsz, temp_start, + kbuf->memsz); + + /* Success, stop navigating through remaining System RAM ranges */ + return 1; +} + +static int locate_mem_hole_callback(u64 start, u64 end, void *arg) +{ + struct kexec_buf *kbuf = (struct kexec_buf *)arg; + unsigned long sz = end - start + 1; + + /* Returning 0 will take to next memory range */ + if (sz < kbuf->memsz) + return 0; + + if (end < kbuf->buf_min || start > kbuf->buf_max) + return 0; + + /* + * Allocate memory top down with-in ram range. Otherwise bottom up + * allocation. + */ + if (kbuf->top_down) + return locate_mem_hole_top_down(start, end, kbuf); + return locate_mem_hole_bottom_up(start, end, kbuf); +} + +/* + * Helper function for placing a buffer in a kexec segment. This assumes + * that kexec_mutex is held. + */ +int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, + unsigned long memsz, unsigned long buf_align, + unsigned long buf_min, unsigned long buf_max, + bool top_down, unsigned long *load_addr) +{ + + struct kexec_segment *ksegment; + struct kexec_buf buf, *kbuf; + int ret; + + /* Currently adding segment this way is allowed only in file mode */ + if (!image->file_mode) + return -EINVAL; + + if (image->nr_segments >= KEXEC_SEGMENT_MAX) + return -EINVAL; + + /* + * Make sure we are not trying to add buffer after allocating + * control pages. All segments need to be placed first before + * any control pages are allocated. As control page allocation + * logic goes through list of segments to make sure there are + * no destination overlaps. + */ + if (!list_empty(&image->control_pages)) { + WARN_ON(1); + return -EINVAL; + } + + memset(&buf, 0, sizeof(struct kexec_buf)); + kbuf = &buf; + kbuf->image = image; + kbuf->buffer = buffer; + kbuf->bufsz = bufsz; + + kbuf->memsz = ALIGN(memsz, PAGE_SIZE); + kbuf->buf_align = max(buf_align, PAGE_SIZE); + kbuf->buf_min = buf_min; + kbuf->buf_max = buf_max; + kbuf->top_down = top_down; + + /* Walk the RAM ranges and allocate a suitable range for the buffer */ + if (image->type == KEXEC_TYPE_CRASH) + ret = walk_iomem_res("Crash kernel", + IORESOURCE_MEM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); + else + ret = walk_system_ram_res(0, -1, kbuf, + locate_mem_hole_callback); + if (ret != 1) { + /* A suitable memory range could not be found for buffer */ + return -EADDRNOTAVAIL; + } + + /* Found a suitable memory range */ + ksegment = &image->segment[image->nr_segments - 1]; + *load_addr = ksegment->mem; + return 0; +} + +/* Calculate and store the digest of segments */ +static int kexec_calculate_store_digests(struct kimage *image) +{ + struct crypto_shash *tfm; + struct shash_desc *desc; + int ret = 0, i, j, zero_buf_sz, sha_region_sz; + size_t desc_size, nullsz; + char *digest; + void *zero_buf; + struct kexec_sha_region *sha_regions; + struct purgatory_info *pi = &image->purgatory_info; + + zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT); + zero_buf_sz = PAGE_SIZE; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + ret = PTR_ERR(tfm); + goto out; + } + + desc_size = crypto_shash_descsize(tfm) + sizeof(*desc); + desc = kzalloc(desc_size, GFP_KERNEL); + if (!desc) { + ret = -ENOMEM; + goto out_free_tfm; + } + + sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region); + sha_regions = vzalloc(sha_region_sz); + if (!sha_regions) + goto out_free_desc; + + desc->tfm = tfm; + desc->flags = 0; + + ret = crypto_shash_init(desc); + if (ret < 0) + goto out_free_sha_regions; + + digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); + if (!digest) { + ret = -ENOMEM; + goto out_free_sha_regions; + } + + for (j = i = 0; i < image->nr_segments; i++) { + struct kexec_segment *ksegment; + + ksegment = &image->segment[i]; + /* + * Skip purgatory as it will be modified once we put digest + * info in purgatory. + */ + if (ksegment->kbuf == pi->purgatory_buf) + continue; + + ret = crypto_shash_update(desc, ksegment->kbuf, + ksegment->bufsz); + if (ret) + break; + + /* + * Assume rest of the buffer is filled with zero and + * update digest accordingly. + */ + nullsz = ksegment->memsz - ksegment->bufsz; + while (nullsz) { + unsigned long bytes = nullsz; + + if (bytes > zero_buf_sz) + bytes = zero_buf_sz; + ret = crypto_shash_update(desc, zero_buf, bytes); + if (ret) + break; + nullsz -= bytes; + } + + if (ret) + break; + + sha_regions[j].start = ksegment->mem; + sha_regions[j].len = ksegment->memsz; + j++; + } + + if (!ret) { + ret = crypto_shash_final(desc, digest); + if (ret) + goto out_free_digest; + ret = kexec_purgatory_get_set_symbol(image, "sha_regions", + sha_regions, sha_region_sz, 0); + if (ret) + goto out_free_digest; + + ret = kexec_purgatory_get_set_symbol(image, "sha256_digest", + digest, SHA256_DIGEST_SIZE, 0); + if (ret) + goto out_free_digest; + } + +out_free_digest: + kfree(digest); +out_free_sha_regions: + vfree(sha_regions); +out_free_desc: + kfree(desc); +out_free_tfm: + kfree(tfm); +out: + return ret; +} + +/* Actually load purgatory. Lot of code taken from kexec-tools */ +static int __kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down) +{ + struct purgatory_info *pi = &image->purgatory_info; + unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad; + unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset; + unsigned char *buf_addr, *src; + int i, ret = 0, entry_sidx = -1; + const Elf_Shdr *sechdrs_c; + Elf_Shdr *sechdrs = NULL; + void *purgatory_buf = NULL; + + /* + * sechdrs_c points to section headers in purgatory and are read + * only. No modifications allowed. + */ + sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff; + + /* + * We can not modify sechdrs_c[] and its fields. It is read only. + * Copy it over to a local copy where one can store some temporary + * data and free it at the end. We need to modify ->sh_addr and + * ->sh_offset fields to keep track of permanent and temporary + * locations of sections. + */ + sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + if (!sechdrs) + return -ENOMEM; + + memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr)); + + /* + * We seem to have multiple copies of sections. First copy is which + * is embedded in kernel in read only section. Some of these sections + * will be copied to a temporary buffer and relocated. And these + * sections will finally be copied to their final destination at + * segment load time. + * + * Use ->sh_offset to reflect section address in memory. It will + * point to original read only copy if section is not allocatable. + * Otherwise it will point to temporary copy which will be relocated. + * + * Use ->sh_addr to contain final address of the section where it + * will go during execution time. + */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type == SHT_NOBITS) + continue; + + sechdrs[i].sh_offset = (unsigned long)pi->ehdr + + sechdrs[i].sh_offset; + } + + /* + * Identify entry point section and make entry relative to section + * start. + */ + entry = pi->ehdr->e_entry; + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + if (!(sechdrs[i].sh_flags & SHF_EXECINSTR)) + continue; + + /* Make entry section relative */ + if (sechdrs[i].sh_addr <= pi->ehdr->e_entry && + ((sechdrs[i].sh_addr + sechdrs[i].sh_size) > + pi->ehdr->e_entry)) { + entry_sidx = i; + entry -= sechdrs[i].sh_addr; + break; + } + } + + /* Determine how much memory is needed to load relocatable object. */ + buf_align = 1; + bss_align = 1; + buf_sz = 0; + bss_sz = 0; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + if (buf_align < align) + buf_align = align; + buf_sz = ALIGN(buf_sz, align); + buf_sz += sechdrs[i].sh_size; + } else { + /* bss section */ + if (bss_align < align) + bss_align = align; + bss_sz = ALIGN(bss_sz, align); + bss_sz += sechdrs[i].sh_size; + } + } + + /* Determine the bss padding required to align bss properly */ + bss_pad = 0; + if (buf_sz & (bss_align - 1)) + bss_pad = bss_align - (buf_sz & (bss_align - 1)); + + memsz = buf_sz + bss_pad + bss_sz; + + /* Allocate buffer for purgatory */ + purgatory_buf = vzalloc(buf_sz); + if (!purgatory_buf) { + ret = -ENOMEM; + goto out; + } + + if (buf_align < bss_align) + buf_align = bss_align; + + /* Add buffer to segment list */ + ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz, + buf_align, min, max, top_down, + &pi->purgatory_load_addr); + if (ret) + goto out; + + /* Load SHF_ALLOC sections */ + buf_addr = purgatory_buf; + load_addr = curr_load_addr = pi->purgatory_load_addr; + bss_addr = load_addr + buf_sz + bss_pad; + + for (i = 0; i < pi->ehdr->e_shnum; i++) { + if (!(sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + + align = sechdrs[i].sh_addralign; + if (sechdrs[i].sh_type != SHT_NOBITS) { + curr_load_addr = ALIGN(curr_load_addr, align); + offset = curr_load_addr - load_addr; + /* We already modifed ->sh_offset to keep src addr */ + src = (char *) sechdrs[i].sh_offset; + memcpy(buf_addr + offset, src, sechdrs[i].sh_size); + + /* Store load address and source address of section */ + sechdrs[i].sh_addr = curr_load_addr; + + /* + * This section got copied to temporary buffer. Update + * ->sh_offset accordingly. + */ + sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset); + + /* Advance to the next address */ + curr_load_addr += sechdrs[i].sh_size; + } else { + bss_addr = ALIGN(bss_addr, align); + sechdrs[i].sh_addr = bss_addr; + bss_addr += sechdrs[i].sh_size; + } + } + + /* Update entry point based on load address of text section */ + if (entry_sidx >= 0) + entry += sechdrs[entry_sidx].sh_addr; + + /* Make kernel jump to purgatory after shutdown */ + image->start = entry; + + /* Used later to get/set symbol values */ + pi->sechdrs = sechdrs; + + /* + * Used later to identify which section is purgatory and skip it + * from checksumming. + */ + pi->purgatory_buf = purgatory_buf; + return ret; +out: + vfree(sechdrs); + vfree(purgatory_buf); + return ret; +} + +static int kexec_apply_relocations(struct kimage *image) +{ + int i, ret; + struct purgatory_info *pi = &image->purgatory_info; + Elf_Shdr *sechdrs = pi->sechdrs; + + /* Apply relocations */ + for (i = 0; i < pi->ehdr->e_shnum; i++) { + Elf_Shdr *section, *symtab; + + if (sechdrs[i].sh_type != SHT_RELA && + sechdrs[i].sh_type != SHT_REL) + continue; + + /* + * For section of type SHT_RELA/SHT_REL, + * ->sh_link contains section header index of associated + * symbol table. And ->sh_info contains section header + * index of section to which relocations apply. + */ + if (sechdrs[i].sh_info >= pi->ehdr->e_shnum || + sechdrs[i].sh_link >= pi->ehdr->e_shnum) + return -ENOEXEC; + + section = &sechdrs[sechdrs[i].sh_info]; + symtab = &sechdrs[sechdrs[i].sh_link]; + + if (!(section->sh_flags & SHF_ALLOC)) + continue; + + /* + * symtab->sh_link contain section header index of associated + * string table. + */ + if (symtab->sh_link >= pi->ehdr->e_shnum) + /* Invalid section number? */ + continue; + + /* + * Respective archicture needs to provide support for applying + * relocations of type SHT_RELA/SHT_REL. + */ + if (sechdrs[i].sh_type == SHT_RELA) + ret = arch_kexec_apply_relocations_add(pi->ehdr, + sechdrs, i); + else if (sechdrs[i].sh_type == SHT_REL) + ret = arch_kexec_apply_relocations(pi->ehdr, + sechdrs, i); + if (ret) + return ret; + } + + return 0; +} + +/* Load relocatable purgatory object and relocate it appropriately */ +int kexec_load_purgatory(struct kimage *image, unsigned long min, + unsigned long max, int top_down, + unsigned long *load_addr) +{ + struct purgatory_info *pi = &image->purgatory_info; + int ret; + + if (kexec_purgatory_size <= 0) + return -EINVAL; + + if (kexec_purgatory_size < sizeof(Elf_Ehdr)) + return -ENOEXEC; + + pi->ehdr = (Elf_Ehdr *)kexec_purgatory; + + if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0 + || pi->ehdr->e_type != ET_REL + || !elf_check_arch(pi->ehdr) + || pi->ehdr->e_shentsize != sizeof(Elf_Shdr)) + return -ENOEXEC; + + if (pi->ehdr->e_shoff >= kexec_purgatory_size + || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) > + kexec_purgatory_size - pi->ehdr->e_shoff)) + return -ENOEXEC; + + ret = __kexec_load_purgatory(image, min, max, top_down); + if (ret) + return ret; + + ret = kexec_apply_relocations(image); + if (ret) + goto out; + + *load_addr = pi->purgatory_load_addr; + return 0; +out: + vfree(pi->sechdrs); + vfree(pi->purgatory_buf); + return ret; +} + +static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi, + const char *name) +{ + Elf_Sym *syms; + Elf_Shdr *sechdrs; + Elf_Ehdr *ehdr; + int i, k; + const char *strtab; + + if (!pi->sechdrs || !pi->ehdr) + return NULL; + + sechdrs = pi->sechdrs; + ehdr = pi->ehdr; + + for (i = 0; i < ehdr->e_shnum; i++) { + if (sechdrs[i].sh_type != SHT_SYMTAB) + continue; + + if (sechdrs[i].sh_link >= ehdr->e_shnum) + /* Invalid strtab section number */ + continue; + strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset; + syms = (Elf_Sym *)sechdrs[i].sh_offset; + + /* Go through symbols for a match */ + for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) { + if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL) + continue; + + if (strcmp(strtab + syms[k].st_name, name) != 0) + continue; + + if (syms[k].st_shndx == SHN_UNDEF || + syms[k].st_shndx >= ehdr->e_shnum) { + pr_debug("Symbol: %s has bad section index %d.\n", + name, syms[k].st_shndx); + return NULL; + } + + /* Found the symbol we are looking for */ + return &syms[k]; + } + } + + return NULL; +} + +void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name) +{ + struct purgatory_info *pi = &image->purgatory_info; + Elf_Sym *sym; + Elf_Shdr *sechdr; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return ERR_PTR(-EINVAL); + + sechdr = &pi->sechdrs[sym->st_shndx]; + + /* + * Returns the address where symbol will finally be loaded after + * kexec_load_segment() + */ + return (void *)(sechdr->sh_addr + sym->st_value); +} + +/* + * Get or set value of a symbol. If "get_value" is true, symbol value is + * returned in buf otherwise symbol value is set based on value in buf. + */ +int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name, + void *buf, unsigned int size, bool get_value) +{ + Elf_Sym *sym; + Elf_Shdr *sechdrs; + struct purgatory_info *pi = &image->purgatory_info; + char *sym_buf; + + sym = kexec_purgatory_find_symbol(pi, name); + if (!sym) + return -EINVAL; + + if (sym->st_size != size) { + pr_err("symbol %s size mismatch: expected %lu actual %u\n", + name, (unsigned long)sym->st_size, size); + return -EINVAL; + } + + sechdrs = pi->sechdrs; + + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + pr_err("symbol %s is in a bss section. Cannot %s\n", name, + get_value ? "get" : "set"); + return -EINVAL; + } + + sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset + + sym->st_value; + + if (get_value) + memcpy((void *)buf, sym_buf, size); + else + memcpy((void *)sym_buf, buf, size); + + return 0; +} + /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/panic.c b/kernel/panic.c index 62e16cef9cc2..d09dc5c32c67 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -224,6 +224,7 @@ static const struct tnt tnts[] = { { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, { TAINT_OOT_MODULE, 'O', ' ' }, { TAINT_UNSIGNED_MODULE, 'E', ' ' }, + { TAINT_SOFTLOCKUP, 'L', ' ' }, }; /** diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 13e839dbca07..5eb0e6c800bb 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -45,6 +45,7 @@ #include <linux/poll.h> #include <linux/irq_work.h> #include <linux/utsname.h> +#include <linux/ctype.h> #include <asm/uaccess.h> @@ -56,7 +57,7 @@ int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ - DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ + MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */ CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; @@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip) * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. It's * definitely not the perfect debug tool (we don't know if _WE_ - * hold it are racing, but it helps tracking those weird code - * path in the console code where we end up in places I want - * locked without the console sempahore held + * hold it and are racing, but it helps tracking those weird code + * paths in the console code where we end up in places I want + * locked without the console sempahore held). */ static int console_locked, console_suspended; @@ -146,8 +147,8 @@ static int console_may_schedule; * the overall length of the record. * * The heads to the first and last entry in the buffer, as well as the - * sequence numbers of these both entries are maintained when messages - * are stored.. + * sequence numbers of these entries are maintained when messages are + * stored. * * If the heads indicate available messages, the length in the header * tells the start next message. A length == 0 for the next message @@ -257,7 +258,7 @@ static u64 clear_seq; static u32 clear_idx; #define PREFIX_MAX 32 -#define LOG_LINE_MAX 1024 - PREFIX_MAX +#define LOG_LINE_MAX (1024 - PREFIX_MAX) /* record buffer */ #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) @@ -266,6 +267,7 @@ static u32 clear_idx; #define LOG_ALIGN __alignof__(struct printk_log) #endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; @@ -344,7 +346,7 @@ static int log_make_free_space(u32 msg_size) while (log_first_seq < log_next_seq) { if (logbuf_has_space(msg_size, false)) return 0; - /* drop old messages until we have enough continuous space */ + /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } @@ -453,11 +455,7 @@ static int log_store(int facility, int level, return msg->text_len; } -#ifdef CONFIG_SECURITY_DMESG_RESTRICT -int dmesg_restrict = 1; -#else -int dmesg_restrict; -#endif +int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) { @@ -828,34 +826,74 @@ void log_buf_kexec_setup(void) /* requested log_buf_len from kernel cmdline */ static unsigned long __initdata new_log_buf_len; -/* save requested log_buf_len since it's too early to process it */ -static int __init log_buf_len_setup(char *str) +/* we practice scaling the ring buffer by powers of 2 */ +static void __init log_buf_len_update(unsigned size) { - unsigned size = memparse(str, &str); - if (size) size = roundup_pow_of_two(size); if (size > log_buf_len) new_log_buf_len = size; +} + +/* save requested log_buf_len since it's too early to process it */ +static int __init log_buf_len_setup(char *str) +{ + unsigned size = memparse(str, &str); + + log_buf_len_update(size); return 0; } early_param("log_buf_len", log_buf_len_setup); +static void __init log_buf_add_cpu(void) +{ + unsigned int cpu_extra; + + /* + * archs should set up cpu_possible_bits properly with + * set_cpu_possible() after setup_arch() but just in + * case lets ensure this is valid. + */ + if (num_possible_cpus() == 1) + return; + + cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN; + + /* by default this will only continue through for large > 64 CPUs */ + if (cpu_extra <= __LOG_BUF_LEN / 2) + return; + + pr_info("log_buf_len individual max cpu contribution: %d bytes\n", + __LOG_CPU_MAX_BUF_LEN); + pr_info("log_buf_len total cpu_extra contributions: %d bytes\n", + cpu_extra); + pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN); + + log_buf_len_update(cpu_extra + __LOG_BUF_LEN); +} + void __init setup_log_buf(int early) { unsigned long flags; char *new_log_buf; int free; + if (log_buf != __log_buf) + return; + + if (!early && !new_log_buf_len) + log_buf_add_cpu(); + if (!new_log_buf_len) return; if (early) { new_log_buf = - memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); + memblock_virt_alloc(new_log_buf_len, LOG_ALIGN); } else { - new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); + new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, + LOG_ALIGN); } if (unlikely(!new_log_buf)) { @@ -872,7 +910,7 @@ void __init setup_log_buf(int early) memcpy(log_buf, __log_buf, __LOG_BUF_LEN); raw_spin_unlock_irqrestore(&logbuf_lock, flags); - pr_info("log_buf_len: %d\n", log_buf_len); + pr_info("log_buf_len: %d bytes\n", log_buf_len); pr_info("early log buf free: %d(%d%%)\n", free, (free * 100) / __LOG_BUF_LEN); } @@ -947,11 +985,7 @@ static inline void boot_delay_msec(int level) } #endif -#if defined(CONFIG_PRINTK_TIME) -static bool printk_time = 1; -#else -static bool printk_time; -#endif +static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME); module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); static size_t print_time(u64 ts, char *buf) @@ -1310,7 +1344,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) * for pending data, not the size; return the count of * records, not the length. */ - error = log_next_idx - syslog_idx; + error = log_next_seq - syslog_seq; } else { u64 seq = syslog_seq; u32 idx = syslog_idx; @@ -1476,7 +1510,7 @@ static struct cont { struct task_struct *owner; /* task of first print*/ u64 ts_nsec; /* time of first print */ u8 level; /* log level of first message */ - u8 facility; /* log level of first message */ + u8 facility; /* log facility of first message */ enum log_flags flags; /* prefix, newline flags */ bool flushed:1; /* buffer sealed and committed */ } cont; @@ -1802,7 +1836,7 @@ EXPORT_SYMBOL(printk); #define LOG_LINE_MAX 0 #define PREFIX_MAX 0 -#define LOG_LINE_MAX 0 + static u64 syslog_seq; static u32 syslog_idx; static u64 console_seq; @@ -1881,11 +1915,12 @@ static int __add_preferred_console(char *name, int idx, char *options, return 0; } /* - * Set up a list of consoles. Called from init/main.c + * Set up a console. Called via do_early_param() in init/main.c + * for each "console=" parameter in the boot command line. */ static int __init console_setup(char *str) { - char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */ char *s, *options, *brl_options = NULL; int idx; @@ -1902,7 +1937,8 @@ static int __init console_setup(char *str) strncpy(buf, str, sizeof(buf) - 1); } buf[sizeof(buf) - 1] = 0; - if ((options = strchr(str, ',')) != NULL) + options = strchr(str, ','); + if (options) *(options++) = 0; #ifdef __sparc__ if (!strcmp(str, "ttya")) @@ -1911,7 +1947,7 @@ static int __init console_setup(char *str) strcpy(buf, "ttyS1"); #endif for (s = buf; *s; s++) - if ((*s >= '0' && *s <= '9') || *s == ',') + if (isdigit(*s) || *s == ',') break; idx = simple_strtoul(s, NULL, 10); *s = 0; @@ -1950,7 +1986,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha i++, c++) if (strcmp(c->name, name) == 0 && c->index == idx) { strlcpy(c->name, name_new, sizeof(c->name)); - c->name[sizeof(c->name) - 1] = 0; c->options = options; c->index = idx_new; return i; @@ -2045,8 +2080,8 @@ EXPORT_SYMBOL(console_lock); /** * console_trylock - try to lock the console system for exclusive use. * - * Tried to acquire a lock which guarantees that the caller has - * exclusive access to the console system and the console_drivers list. + * Try to acquire a lock which guarantees that the caller has exclusive + * access to the console system and the console_drivers list. * * returns 1 on success, and 0 on failure to acquire the lock. */ @@ -2618,14 +2653,13 @@ EXPORT_SYMBOL(__printk_ratelimit); bool printk_timed_ratelimit(unsigned long *caller_jiffies, unsigned int interval_msecs) { - if (*caller_jiffies == 0 - || !time_in_range(jiffies, *caller_jiffies, - *caller_jiffies - + msecs_to_jiffies(interval_msecs))) { - *caller_jiffies = jiffies; - return true; - } - return false; + unsigned long elapsed = jiffies - *caller_jiffies; + + if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs)) + return false; + + *caller_jiffies = jiffies; + return true; } EXPORT_SYMBOL(printk_timed_ratelimit); diff --git a/kernel/resource.c b/kernel/resource.c index 3c2237ac32db..da14b8d09296 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock); static struct resource *bootmem_resource_free; static DEFINE_SPINLOCK(bootmem_resource_lock); -static void *r_next(struct seq_file *m, void *v, loff_t *pos) +static struct resource *next_resource(struct resource *p, bool sibling_only) { - struct resource *p = v; - (*pos)++; + /* Caller wants to traverse through siblings only */ + if (sibling_only) + return p->sibling; + if (p->child) return p->child; while (!p->sibling && p->parent) @@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos) return p->sibling; } +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + return (void *)next_resource(p, false); +} + #ifdef CONFIG_PROC_FS enum { MAX_IORES_LEVEL = 5 }; @@ -322,16 +331,19 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); -#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * Finds the lowest memory reosurce exists within [res->start.res->end) + * Finds the lowest iomem reosurce exists with-in [res->start.res->end) * the caller must specify res->start, res->end, res->flags and "name". * If found, returns 0, res is overwritten, if not found, returns -1. + * This walks through whole tree and not just first level children + * until and unless first_level_children_only is true. */ -static int find_next_system_ram(struct resource *res, char *name) +static int find_next_iomem_res(struct resource *res, char *name, + bool first_level_children_only) { resource_size_t start, end; struct resource *p; + bool sibling_only = false; BUG_ON(!res); @@ -340,8 +352,14 @@ static int find_next_system_ram(struct resource *res, char *name) BUG_ON(start >= end); read_lock(&resource_lock); - for (p = iomem_resource.child; p ; p = p->sibling) { - /* system ram is just marked as IORESOURCE_MEM */ + + if (first_level_children_only) { + p = iomem_resource.child; + sibling_only = true; + } else + p = &iomem_resource; + + while ((p = next_resource(p, sibling_only))) { if (p->flags != res->flags) continue; if (name && strcmp(p->name, name)) @@ -353,6 +371,7 @@ static int find_next_system_ram(struct resource *res, char *name) if ((p->end >= start) && (p->start < end)) break; } + read_unlock(&resource_lock); if (!p) return -1; @@ -365,6 +384,70 @@ static int find_next_system_ram(struct resource *res, char *name) } /* + * Walks through iomem resources and calls func() with matching resource + * ranges. This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * name are valid candidates. + * + * @name: name of resource + * @flags: resource flags + * @start: start addr + * @end: end addr + */ +int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, + void *arg, int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = flags; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, name, false))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". This function deals with + * full ranges and not pfn. If resources are not pfn aligned, dealing + * with pfn can truncate ranges. + */ +int walk_system_ram_res(u64 start, u64 end, void *arg, + int (*func)(u64, u64, void *)) +{ + struct resource res; + u64 orig_end; + int ret = -1; + + res.start = start; + res.end = end; + res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + orig_end = res.end; + while ((res.start < res.end) && + (!find_next_iomem_res(&res, "System RAM", true))) { + ret = (*func)(res.start, res.end, arg); + if (ret) + break; + res.start = res.end + 1; + res.end = orig_end; + } + return ret; +} + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) + +/* * This function calls callback against all memory range of "System RAM" * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. * Now, this function is only for "System RAM". @@ -382,7 +465,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_system_ram(&res, "System RAM") >= 0)) { + (find_next_iomem_res(&res, "System RAM", true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 36441b51b5df..51ea89f3089c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -25,6 +25,7 @@ cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_kexec_file_load); cond_syscall(sys_init_module); cond_syscall(sys_finit_module); cond_syscall(sys_delete_module); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 75b22e22a72c..75875a741b5e 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1240,8 +1240,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #ifdef CONFIG_NUMA { @@ -1250,8 +1249,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { @@ -1274,8 +1272,7 @@ static struct ctl_table vm_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = hugetlb_overcommit_handler, - .extra1 = (void *)&hugetlb_zero, - .extra2 = (void *)&hugetlb_infinity, + .extra1 = &zero, }, #endif { diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 12d6ebbfdd83..0dbab6d1acb4 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -14,6 +14,8 @@ * the GNU General Public License for more details. */ +#define pr_fmt(fmt) "Kprobe smoke test: " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/random.h> @@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler\n"); + pr_err("incorrect value in post_handler\n"); } posth_val = preh_val + div_factor; } @@ -59,8 +60,7 @@ static int test_kprobe(void) ret = register_kprobe(&kp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobe returned %d\n", ret); + pr_err("register_kprobe returned %d\n", ret); return ret; } @@ -68,14 +68,12 @@ static int test_kprobe(void) unregister_kprobe(&kp); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, { if (preh_val != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in post_handler2\n"); + pr_err("incorrect value in post_handler2\n"); } posth_val = preh_val + div_factor; } @@ -120,8 +117,7 @@ static int test_kprobes(void) kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kprobes returned %d\n", ret); + pr_err("register_kprobes returned %d\n", ret); return ret; } @@ -130,14 +126,12 @@ static int test_kprobes(void) ret = target(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler not called\n"); + pr_err("kprobe pre_handler not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler not called\n"); + pr_err("kprobe post_handler not called\n"); handler_errors++; } @@ -146,14 +140,12 @@ static int test_kprobes(void) ret = target2(rand1); if (preh_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe pre_handler2 not called\n"); + pr_err("kprobe pre_handler2 not called\n"); handler_errors++; } if (posth_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kprobe post_handler2 not called\n"); + pr_err("kprobe post_handler2 not called\n"); handler_errors++; } @@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value) { if (value != rand1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in jprobe handler\n"); + pr_err("incorrect value in jprobe handler\n"); } jph_val = rand1; @@ -186,16 +177,14 @@ static int test_jprobe(void) ret = register_jprobe(&jp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobe returned %d\n", ret); + pr_err("register_jprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_jprobe(&jp); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } @@ -217,24 +206,21 @@ static int test_jprobes(void) jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_jprobes returned %d\n", ret); + pr_err("register_jprobes returned %d\n", ret); return ret; } jph_val = 0; ret = target(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler not called\n"); + pr_err("jprobe handler not called\n"); handler_errors++; } jph_val = 0; ret = target2(rand1); if (jph_val == 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "jprobe handler2 not called\n"); + pr_err("jprobe handler2 not called\n"); handler_errors++; } unregister_jprobes(jps, 2); @@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor)) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler\n"); + pr_err("incorrect value in kretprobe handler\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -281,16 +265,14 @@ static int test_kretprobe(void) ret = register_kretprobe(&rp); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } ret = target(rand1); unregister_kretprobe(&rp); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } @@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) if (ret != (rand1 / div_factor) + 1) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "incorrect value in kretprobe handler2\n"); + pr_err("incorrect value in kretprobe handler2\n"); } if (krph_val == 0) { handler_errors++; - printk(KERN_ERR "Kprobe smoke test failed: " - "call to kretprobe entry handler failed\n"); + pr_err("call to kretprobe entry handler failed\n"); } krph_val = rand1; @@ -332,24 +312,21 @@ static int test_kretprobes(void) rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { - printk(KERN_ERR "Kprobe smoke test failed: " - "register_kretprobe returned %d\n", ret); + pr_err("register_kretprobe returned %d\n", ret); return ret; } krph_val = 0; ret = target(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler not called\n"); + pr_err("kretprobe handler not called\n"); handler_errors++; } krph_val = 0; ret = target2(rand1); if (krph_val != rand1) { - printk(KERN_ERR "Kprobe smoke test failed: " - "kretprobe handler2 not called\n"); + pr_err("kretprobe handler2 not called\n"); handler_errors++; } unregister_kretprobes(rps, 2); @@ -368,7 +345,7 @@ int init_test_probes(void) rand1 = prandom_u32(); } while (rand1 <= div_factor); - printk(KERN_INFO "Kprobe smoke test started\n"); + pr_info("started\n"); num_tests++; ret = test_kprobe(); if (ret < 0) @@ -402,13 +379,11 @@ int init_test_probes(void) #endif /* CONFIG_KRETPROBES */ if (errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " - "%d tests failed\n", errors, num_tests); + pr_err("BUG: %d out of %d tests failed\n", errors, num_tests); else if (handler_errors) - printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " - "running handlers\n", handler_errors); + pr_err("BUG: %d error(s) running handlers\n", handler_errors); else - printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + pr_info("passed successfully\n"); return 0; } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index fcc02560fd6b..aa312b0dc3ec 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v) return; } -struct seq_operations proc_uid_seq_operations = { +const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; -struct seq_operations proc_gid_seq_operations = { +const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; -struct seq_operations proc_projid_seq_operations = { +const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index c3319bd1b040..a8d6914030fe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -260,9 +260,11 @@ static void watchdog_overflow_callback(struct perf_event *event, return; if (hardlockup_panic) - panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + panic("Watchdog detected hard LOCKUP on cpu %d", + this_cpu); else - WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", + this_cpu); __this_cpu_write(hard_watchdog_warn, true); return; @@ -345,7 +347,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) } } - printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", + pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", smp_processor_id(), duration, current->comm, task_pid_nr(current)); print_modules(); @@ -366,6 +368,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) smp_mb__after_atomic(); } + add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK); if (softlockup_panic) panic("softlockup: hung tasks"); __this_cpu_write(soft_watchdog_warn, true); @@ -484,7 +487,7 @@ static int watchdog_nmi_enable(unsigned int cpu) if (PTR_ERR(event) == -EOPNOTSUPP) pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu); else if (PTR_ERR(event) == -ENOENT) - pr_warning("disabled (cpu%i): hardware events not enabled\n", + pr_warn("disabled (cpu%i): hardware events not enabled\n", cpu); else pr_err("disabled (cpu%i): unable to create perf event: %ld\n", diff --git a/lib/Kconfig b/lib/Kconfig index 334f7722a999..fdf90f3aa90d 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -177,6 +177,13 @@ config CRC8 when they need to do cyclic redundancy check according CRC8 algorithm. Module will be called crc8. +config CRC64_ECMA + tristate "CRC64 ECMA function" + help + This option provides CRC64 ECMA function. Drivers may select this + when they need to do cyclic redundancy check according to the CRC64 + ECMA algorithm. + config AUDIT_GENERIC bool depends on AUDIT && !AUDIT_ARCH @@ -396,6 +403,39 @@ config CPU_RMAP config DQL bool +config GLOB + bool +# This actually supports modular compilation, but the module overhead +# is ridiculous for the amount of code involved. Until an out-of-tree +# driver asks for it, we'll just link it directly it into the kernel +# when required. Since we're ignoring out-of-tree users, there's also +# no need bother prompting for a manual decision: +# prompt "glob_match() function" + help + This option provides a glob_match function for performing + simple text pattern matching. It originated in the ATA code + to blacklist particular drive models, but other device drivers + may need similar functionality. + + All drivers in the Linux kernel tree that require this function + should automatically select this option. Say N unless you + are compiling an out-of tree driver which tells you that it + depends on this. + +config GLOB_SELFTEST + bool "glob self-test on init" + default n + depends on GLOB + help + This option enables a simple self-test of the glob_match + function on startup. It is primarily useful for people + working on the code to ensure they haven't introduced any + regressions. + + It only adds a little bit of code and slows kernel boot (or + module load) by a small amount, so you're welcome to play with + it, but you probably don't need it. + # # Netlink attribute parsing support is select'ed if needed # @@ -474,4 +514,11 @@ config UCS2_STRING source "lib/fonts/Kconfig" +# +# sg chaining option +# + +config ARCH_HAS_SG_CHAIN + def_bool n + endmenu diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4fc8e743afdf..6b62a765a92f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -15,7 +15,7 @@ config PRINTK_TIME The behavior is also controlled by the kernel command line parameter printk.time=1. See Documentation/kernel-parameters.txt -config DEFAULT_MESSAGE_LOGLEVEL +config MESSAGE_LOGLEVEL_DEFAULT int "Default message log level (1-7)" range 1 7 default "4" diff --git a/lib/Makefile b/lib/Makefile index ba967a19edba..e48067c304d7 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -71,6 +71,7 @@ obj-$(CONFIG_CRC32) += crc32.o obj-$(CONFIG_CRC7) += crc7.o obj-$(CONFIG_LIBCRC32C) += libcrc32c.o obj-$(CONFIG_CRC8) += crc8.o +obj-$(CONFIG_CRC64_ECMA) += crc64_ecma.o obj-$(CONFIG_GENERIC_ALLOCATOR) += genalloc.o obj-$(CONFIG_ZLIB_INFLATE) += zlib_inflate/ @@ -136,6 +137,8 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_DQL) += dynamic_queue_limits.o +obj-$(CONFIG_GLOB) += glob.o + obj-$(CONFIG_MPILIB) += mpi/ obj-$(CONFIG_SIGNATURE) += digsig.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 06f7e4fe8d2d..1e031f2c9aba 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -40,9 +40,9 @@ * for the best explanations of this ordering. */ -int __bitmap_empty(const unsigned long *bitmap, int bits) +int __bitmap_empty(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap[k]) return 0; @@ -55,9 +55,9 @@ int __bitmap_empty(const unsigned long *bitmap, int bits) } EXPORT_SYMBOL(__bitmap_empty); -int __bitmap_full(const unsigned long *bitmap, int bits) +int __bitmap_full(const unsigned long *bitmap, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (~bitmap[k]) return 0; @@ -71,9 +71,9 @@ int __bitmap_full(const unsigned long *bitmap, int bits) EXPORT_SYMBOL(__bitmap_full); int __bitmap_equal(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] != bitmap2[k]) return 0; @@ -86,14 +86,14 @@ int __bitmap_equal(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_equal); -void __bitmap_complement(unsigned long *dst, const unsigned long *src, int bits) +void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) dst[k] = ~src[k]; if (bits % BITS_PER_LONG) - dst[k] = ~src[k] & BITMAP_LAST_WORD_MASK(bits); + dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); @@ -182,23 +182,26 @@ void __bitmap_shift_left(unsigned long *dst, EXPORT_SYMBOL(__bitmap_shift_left); int __bitmap_and(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_and); void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] | bitmap2[k]; @@ -206,10 +209,10 @@ void __bitmap_or(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_or); void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int nr = BITS_TO_LONGS(bits); for (k = 0; k < nr; k++) dst[k] = bitmap1[k] ^ bitmap2[k]; @@ -217,22 +220,25 @@ void __bitmap_xor(unsigned long *dst, const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_xor); int __bitmap_andnot(unsigned long *dst, const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k; - int nr = BITS_TO_LONGS(bits); + unsigned int k; + unsigned int lim = bits/BITS_PER_LONG; unsigned long result = 0; - for (k = 0; k < nr; k++) + for (k = 0; k < lim; k++) result |= (dst[k] = bitmap1[k] & ~bitmap2[k]); + if (bits % BITS_PER_LONG) + result |= (dst[k] = bitmap1[k] & ~bitmap2[k] & + BITMAP_LAST_WORD_MASK(bits)); return result != 0; } EXPORT_SYMBOL(__bitmap_andnot); int __bitmap_intersects(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & bitmap2[k]) return 1; @@ -245,9 +251,9 @@ int __bitmap_intersects(const unsigned long *bitmap1, EXPORT_SYMBOL(__bitmap_intersects); int __bitmap_subset(const unsigned long *bitmap1, - const unsigned long *bitmap2, int bits) + const unsigned long *bitmap2, unsigned int bits) { - int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; for (k = 0; k < lim; ++k) if (bitmap1[k] & ~bitmap2[k]) return 0; @@ -259,9 +265,10 @@ int __bitmap_subset(const unsigned long *bitmap1, } EXPORT_SYMBOL(__bitmap_subset); -int __bitmap_weight(const unsigned long *bitmap, int bits) +int __bitmap_weight(const unsigned long *bitmap, unsigned int bits) { - int k, w = 0, lim = bits/BITS_PER_LONG; + unsigned int k, lim = bits/BITS_PER_LONG; + int w = 0; for (k = 0; k < lim; k++) w += hweight_long(bitmap[k]); @@ -273,42 +280,42 @@ int __bitmap_weight(const unsigned long *bitmap, int bits) } EXPORT_SYMBOL(__bitmap_weight); -void bitmap_set(unsigned long *map, int start, int nr) +void bitmap_set(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_set = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_set = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_set >= 0) { + while (len - bits_to_set >= 0) { *p |= mask_to_set; - nr -= bits_to_set; + len -= bits_to_set; bits_to_set = BITS_PER_LONG; mask_to_set = ~0UL; p++; } - if (nr) { + if (len) { mask_to_set &= BITMAP_LAST_WORD_MASK(size); *p |= mask_to_set; } } EXPORT_SYMBOL(bitmap_set); -void bitmap_clear(unsigned long *map, int start, int nr) +void bitmap_clear(unsigned long *map, unsigned int start, int len) { unsigned long *p = map + BIT_WORD(start); - const int size = start + nr; + const unsigned int size = start + len; int bits_to_clear = BITS_PER_LONG - (start % BITS_PER_LONG); unsigned long mask_to_clear = BITMAP_FIRST_WORD_MASK(start); - while (nr - bits_to_clear >= 0) { + while (len - bits_to_clear >= 0) { *p &= ~mask_to_clear; - nr -= bits_to_clear; + len -= bits_to_clear; bits_to_clear = BITS_PER_LONG; mask_to_clear = ~0UL; p++; } - if (nr) { + if (len) { mask_to_clear &= BITMAP_LAST_WORD_MASK(size); *p &= ~mask_to_clear; } @@ -664,13 +671,8 @@ static int __bitmap_parselist(const char *buf, unsigned int buflen, int bitmap_parselist(const char *bp, unsigned long *maskp, int nmaskbits) { - char *nl = strchr(bp, '\n'); - int len; - - if (nl) - len = nl - bp; - else - len = strlen(bp); + char *nl = strchrnul(bp, '\n'); + int len = nl - bp; return __bitmap_parselist(bp, len, 0, maskp, nmaskbits); } @@ -716,7 +718,7 @@ EXPORT_SYMBOL(bitmap_parselist_user); * * If for example, just bits 4 through 7 are set in @buf, then @pos * values 4 through 7 will get mapped to 0 through 3, respectively, - * and other @pos values will get mapped to 0. When @pos value 7 + * and other @pos values will get mapped to -1. When @pos value 7 * gets mapped to (returns) @ord value 3 in this example, that means * that bit 7 is the 3rd (starting with 0th) set bit in @buf. * @@ -1046,7 +1048,7 @@ enum { REG_OP_RELEASE, /* clear all bits in region */ }; -static int __reg_op(unsigned long *bitmap, int pos, int order, int reg_op) +static int __reg_op(unsigned long *bitmap, unsigned int pos, int order, int reg_op) { int nbits_reg; /* number of bits in region */ int index; /* index first long of region in bitmap */ @@ -1112,11 +1114,11 @@ done: * Return the bit offset in bitmap of the allocated region, * or -errno on failure. */ -int bitmap_find_free_region(unsigned long *bitmap, int bits, int order) +int bitmap_find_free_region(unsigned long *bitmap, unsigned int bits, int order) { - int pos, end; /* scans bitmap by regions of size order */ + unsigned int pos, end; /* scans bitmap by regions of size order */ - for (pos = 0 ; (end = pos + (1 << order)) <= bits; pos = end) { + for (pos = 0 ; (end = pos + (1U << order)) <= bits; pos = end) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) continue; __reg_op(bitmap, pos, order, REG_OP_ALLOC); @@ -1137,7 +1139,7 @@ EXPORT_SYMBOL(bitmap_find_free_region); * * No return value. */ -void bitmap_release_region(unsigned long *bitmap, int pos, int order) +void bitmap_release_region(unsigned long *bitmap, unsigned int pos, int order) { __reg_op(bitmap, pos, order, REG_OP_RELEASE); } @@ -1154,12 +1156,11 @@ EXPORT_SYMBOL(bitmap_release_region); * Return 0 on success, or %-EBUSY if specified region wasn't * free (not all bits were zero). */ -int bitmap_allocate_region(unsigned long *bitmap, int pos, int order) +int bitmap_allocate_region(unsigned long *bitmap, unsigned int pos, int order) { if (!__reg_op(bitmap, pos, order, REG_OP_ISFREE)) return -EBUSY; - __reg_op(bitmap, pos, order, REG_OP_ALLOC); - return 0; + return __reg_op(bitmap, pos, order, REG_OP_ALLOC); } EXPORT_SYMBOL(bitmap_allocate_region); diff --git a/lib/cmdline.c b/lib/cmdline.c index d4932f745e92..76a712e6e20e 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -121,11 +121,7 @@ EXPORT_SYMBOL(get_options); * @retptr: (output) Optional pointer to next char after parse completes * * Parses a string into a number. The number stored at @ptr is - * potentially suffixed with %K (for kilobytes, or 1024 bytes), - * %M (for megabytes, or 1048576 bytes), or %G (for gigabytes, or - * 1073741824). If the number is suffixed with K, M, or G, then - * the return value is the number multiplied by one kilobyte, one - * megabyte, or one gigabyte, respectively. + * potentially suffixed with K, M, G, T, P, E. */ unsigned long long memparse(const char *ptr, char **retptr) @@ -135,6 +131,15 @@ unsigned long long memparse(const char *ptr, char **retptr) unsigned long long ret = simple_strtoull(ptr, &endptr, 0); switch (*endptr) { + case 'E': + case 'e': + ret <<= 10; + case 'P': + case 'p': + ret <<= 10; + case 'T': + case 't': + ret <<= 10; case 'G': case 'g': ret <<= 10; diff --git a/lib/crc64_ecma.c b/lib/crc64_ecma.c new file mode 100644 index 000000000000..41629ea5a60c --- /dev/null +++ b/lib/crc64_ecma.c @@ -0,0 +1,341 @@ +/* + * Copyright 2013 Freescale Semiconductor Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Freescale Semiconductor nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * + * ALTERNATIVELY, this software may be distributed under the terms of the + * GNU General Public License ("GPL") as published by the Free Software + * Foundation, either version 2 of that License or (at your option) any + * later version. + * + * THIS SOFTWARE IS PROVIDED BY Freescale Semiconductor ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL Freescale Semiconductor BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <linux/module.h> +#include <linux/crc64_ecma.h> + + +#define CRC64_BYTE_MASK 0xFF +#define CRC64_TABLE_SIZE 256 + + +struct crc64_table { + u64 seed; + u64 table[CRC64_TABLE_SIZE]; +}; + + +static struct crc64_table CRC64_ECMA_182 = { + CRC64_DEFAULT_INITVAL, + { + 0x0000000000000000ULL, + 0xb32e4cbe03a75f6fULL, + 0xf4843657a840a05bULL, + 0x47aa7ae9abe7ff34ULL, + 0x7bd0c384ff8f5e33ULL, + 0xc8fe8f3afc28015cULL, + 0x8f54f5d357cffe68ULL, + 0x3c7ab96d5468a107ULL, + 0xf7a18709ff1ebc66ULL, + 0x448fcbb7fcb9e309ULL, + 0x0325b15e575e1c3dULL, + 0xb00bfde054f94352ULL, + 0x8c71448d0091e255ULL, + 0x3f5f08330336bd3aULL, + 0x78f572daa8d1420eULL, + 0xcbdb3e64ab761d61ULL, + 0x7d9ba13851336649ULL, + 0xceb5ed8652943926ULL, + 0x891f976ff973c612ULL, + 0x3a31dbd1fad4997dULL, + 0x064b62bcaebc387aULL, + 0xb5652e02ad1b6715ULL, + 0xf2cf54eb06fc9821ULL, + 0x41e11855055bc74eULL, + 0x8a3a2631ae2dda2fULL, + 0x39146a8fad8a8540ULL, + 0x7ebe1066066d7a74ULL, + 0xcd905cd805ca251bULL, + 0xf1eae5b551a2841cULL, + 0x42c4a90b5205db73ULL, + 0x056ed3e2f9e22447ULL, + 0xb6409f5cfa457b28ULL, + 0xfb374270a266cc92ULL, + 0x48190ecea1c193fdULL, + 0x0fb374270a266cc9ULL, + 0xbc9d3899098133a6ULL, + 0x80e781f45de992a1ULL, + 0x33c9cd4a5e4ecdceULL, + 0x7463b7a3f5a932faULL, + 0xc74dfb1df60e6d95ULL, + 0x0c96c5795d7870f4ULL, + 0xbfb889c75edf2f9bULL, + 0xf812f32ef538d0afULL, + 0x4b3cbf90f69f8fc0ULL, + 0x774606fda2f72ec7ULL, + 0xc4684a43a15071a8ULL, + 0x83c230aa0ab78e9cULL, + 0x30ec7c140910d1f3ULL, + 0x86ace348f355aadbULL, + 0x3582aff6f0f2f5b4ULL, + 0x7228d51f5b150a80ULL, + 0xc10699a158b255efULL, + 0xfd7c20cc0cdaf4e8ULL, + 0x4e526c720f7dab87ULL, + 0x09f8169ba49a54b3ULL, + 0xbad65a25a73d0bdcULL, + 0x710d64410c4b16bdULL, + 0xc22328ff0fec49d2ULL, + 0x85895216a40bb6e6ULL, + 0x36a71ea8a7ace989ULL, + 0x0adda7c5f3c4488eULL, + 0xb9f3eb7bf06317e1ULL, + 0xfe5991925b84e8d5ULL, + 0x4d77dd2c5823b7baULL, + 0x64b62bcaebc387a1ULL, + 0xd7986774e864d8ceULL, + 0x90321d9d438327faULL, + 0x231c512340247895ULL, + 0x1f66e84e144cd992ULL, + 0xac48a4f017eb86fdULL, + 0xebe2de19bc0c79c9ULL, + 0x58cc92a7bfab26a6ULL, + 0x9317acc314dd3bc7ULL, + 0x2039e07d177a64a8ULL, + 0x67939a94bc9d9b9cULL, + 0xd4bdd62abf3ac4f3ULL, + 0xe8c76f47eb5265f4ULL, + 0x5be923f9e8f53a9bULL, + 0x1c4359104312c5afULL, + 0xaf6d15ae40b59ac0ULL, + 0x192d8af2baf0e1e8ULL, + 0xaa03c64cb957be87ULL, + 0xeda9bca512b041b3ULL, + 0x5e87f01b11171edcULL, + 0x62fd4976457fbfdbULL, + 0xd1d305c846d8e0b4ULL, + 0x96797f21ed3f1f80ULL, + 0x2557339fee9840efULL, + 0xee8c0dfb45ee5d8eULL, + 0x5da24145464902e1ULL, + 0x1a083bacedaefdd5ULL, + 0xa9267712ee09a2baULL, + 0x955cce7fba6103bdULL, + 0x267282c1b9c65cd2ULL, + 0x61d8f8281221a3e6ULL, + 0xd2f6b4961186fc89ULL, + 0x9f8169ba49a54b33ULL, + 0x2caf25044a02145cULL, + 0x6b055fede1e5eb68ULL, + 0xd82b1353e242b407ULL, + 0xe451aa3eb62a1500ULL, + 0x577fe680b58d4a6fULL, + 0x10d59c691e6ab55bULL, + 0xa3fbd0d71dcdea34ULL, + 0x6820eeb3b6bbf755ULL, + 0xdb0ea20db51ca83aULL, + 0x9ca4d8e41efb570eULL, + 0x2f8a945a1d5c0861ULL, + 0x13f02d374934a966ULL, + 0xa0de61894a93f609ULL, + 0xe7741b60e174093dULL, + 0x545a57dee2d35652ULL, + 0xe21ac88218962d7aULL, + 0x5134843c1b317215ULL, + 0x169efed5b0d68d21ULL, + 0xa5b0b26bb371d24eULL, + 0x99ca0b06e7197349ULL, + 0x2ae447b8e4be2c26ULL, + 0x6d4e3d514f59d312ULL, + 0xde6071ef4cfe8c7dULL, + 0x15bb4f8be788911cULL, + 0xa6950335e42fce73ULL, + 0xe13f79dc4fc83147ULL, + 0x521135624c6f6e28ULL, + 0x6e6b8c0f1807cf2fULL, + 0xdd45c0b11ba09040ULL, + 0x9aefba58b0476f74ULL, + 0x29c1f6e6b3e0301bULL, + 0xc96c5795d7870f42ULL, + 0x7a421b2bd420502dULL, + 0x3de861c27fc7af19ULL, + 0x8ec62d7c7c60f076ULL, + 0xb2bc941128085171ULL, + 0x0192d8af2baf0e1eULL, + 0x4638a2468048f12aULL, + 0xf516eef883efae45ULL, + 0x3ecdd09c2899b324ULL, + 0x8de39c222b3eec4bULL, + 0xca49e6cb80d9137fULL, + 0x7967aa75837e4c10ULL, + 0x451d1318d716ed17ULL, + 0xf6335fa6d4b1b278ULL, + 0xb199254f7f564d4cULL, + 0x02b769f17cf11223ULL, + 0xb4f7f6ad86b4690bULL, + 0x07d9ba1385133664ULL, + 0x4073c0fa2ef4c950ULL, + 0xf35d8c442d53963fULL, + 0xcf273529793b3738ULL, + 0x7c0979977a9c6857ULL, + 0x3ba3037ed17b9763ULL, + 0x888d4fc0d2dcc80cULL, + 0x435671a479aad56dULL, + 0xf0783d1a7a0d8a02ULL, + 0xb7d247f3d1ea7536ULL, + 0x04fc0b4dd24d2a59ULL, + 0x3886b22086258b5eULL, + 0x8ba8fe9e8582d431ULL, + 0xcc0284772e652b05ULL, + 0x7f2cc8c92dc2746aULL, + 0x325b15e575e1c3d0ULL, + 0x8175595b76469cbfULL, + 0xc6df23b2dda1638bULL, + 0x75f16f0cde063ce4ULL, + 0x498bd6618a6e9de3ULL, + 0xfaa59adf89c9c28cULL, + 0xbd0fe036222e3db8ULL, + 0x0e21ac88218962d7ULL, + 0xc5fa92ec8aff7fb6ULL, + 0x76d4de52895820d9ULL, + 0x317ea4bb22bfdfedULL, + 0x8250e80521188082ULL, + 0xbe2a516875702185ULL, + 0x0d041dd676d77eeaULL, + 0x4aae673fdd3081deULL, + 0xf9802b81de97deb1ULL, + 0x4fc0b4dd24d2a599ULL, + 0xfceef8632775faf6ULL, + 0xbb44828a8c9205c2ULL, + 0x086ace348f355aadULL, + 0x34107759db5dfbaaULL, + 0x873e3be7d8faa4c5ULL, + 0xc094410e731d5bf1ULL, + 0x73ba0db070ba049eULL, + 0xb86133d4dbcc19ffULL, + 0x0b4f7f6ad86b4690ULL, + 0x4ce50583738cb9a4ULL, + 0xffcb493d702be6cbULL, + 0xc3b1f050244347ccULL, + 0x709fbcee27e418a3ULL, + 0x3735c6078c03e797ULL, + 0x841b8ab98fa4b8f8ULL, + 0xadda7c5f3c4488e3ULL, + 0x1ef430e13fe3d78cULL, + 0x595e4a08940428b8ULL, + 0xea7006b697a377d7ULL, + 0xd60abfdbc3cbd6d0ULL, + 0x6524f365c06c89bfULL, + 0x228e898c6b8b768bULL, + 0x91a0c532682c29e4ULL, + 0x5a7bfb56c35a3485ULL, + 0xe955b7e8c0fd6beaULL, + 0xaeffcd016b1a94deULL, + 0x1dd181bf68bdcbb1ULL, + 0x21ab38d23cd56ab6ULL, + 0x9285746c3f7235d9ULL, + 0xd52f0e859495caedULL, + 0x6601423b97329582ULL, + 0xd041dd676d77eeaaULL, + 0x636f91d96ed0b1c5ULL, + 0x24c5eb30c5374ef1ULL, + 0x97eba78ec690119eULL, + 0xab911ee392f8b099ULL, + 0x18bf525d915feff6ULL, + 0x5f1528b43ab810c2ULL, + 0xec3b640a391f4fadULL, + 0x27e05a6e926952ccULL, + 0x94ce16d091ce0da3ULL, + 0xd3646c393a29f297ULL, + 0x604a2087398eadf8ULL, + 0x5c3099ea6de60cffULL, + 0xef1ed5546e415390ULL, + 0xa8b4afbdc5a6aca4ULL, + 0x1b9ae303c601f3cbULL, + 0x56ed3e2f9e224471ULL, + 0xe5c372919d851b1eULL, + 0xa26908783662e42aULL, + 0x114744c635c5bb45ULL, + 0x2d3dfdab61ad1a42ULL, + 0x9e13b115620a452dULL, + 0xd9b9cbfcc9edba19ULL, + 0x6a978742ca4ae576ULL, + 0xa14cb926613cf817ULL, + 0x1262f598629ba778ULL, + 0x55c88f71c97c584cULL, + 0xe6e6c3cfcadb0723ULL, + 0xda9c7aa29eb3a624ULL, + 0x69b2361c9d14f94bULL, + 0x2e184cf536f3067fULL, + 0x9d36004b35545910ULL, + 0x2b769f17cf112238ULL, + 0x9858d3a9ccb67d57ULL, + 0xdff2a94067518263ULL, + 0x6cdce5fe64f6dd0cULL, + 0x50a65c93309e7c0bULL, + 0xe388102d33392364ULL, + 0xa4226ac498dedc50ULL, + 0x170c267a9b79833fULL, + 0xdcd7181e300f9e5eULL, + 0x6ff954a033a8c131ULL, + 0x28532e49984f3e05ULL, + 0x9b7d62f79be8616aULL, + 0xa707db9acf80c06dULL, + 0x14299724cc279f02ULL, + 0x5383edcd67c06036ULL, + 0xe0ada17364673f59ULL + } +}; + + +/* + * crc64_ecma_seed - Initializes the CRC64 ECMA seed. + */ +u64 crc64_ecma_seed(void) +{ + return CRC64_ECMA_182.seed; +} +EXPORT_SYMBOL(crc64_ecma_seed); + +/* + * crc64_ecma - Computes the 64 bit ECMA CRC. + * + * pdata: pointer to the data to compute checksum for. + * nbytes: number of bytes in data buffer. + * seed: CRC seed. + */ +u64 crc64_ecma(u8 const *pdata, u32 nbytes, u64 seed) +{ + unsigned int i; + u64 crc = seed; + + for (i = 0; i < nbytes; i++) + crc = CRC64_ECMA_182.table[(crc ^ pdata[i]) & CRC64_BYTE_MASK] ^ + (crc >> 8); + + return crc; +} +EXPORT_SYMBOL(crc64_ecma); + +MODULE_DESCRIPTION("CRC64 ECMA function"); +MODULE_AUTHOR("Freescale Semiconductor Inc."); +MODULE_LICENSE("GPL"); diff --git a/lib/decompress.c b/lib/decompress.c index 86069d74c062..37f3c786348f 100644 --- a/lib/decompress.c +++ b/lib/decompress.c @@ -54,7 +54,7 @@ static const struct compress_format compressed_formats[] __initconst = { { {0, 0}, NULL, NULL } }; -decompress_fn __init decompress_method(const unsigned char *inbuf, int len, +decompress_fn __init decompress_method(const unsigned char *inbuf, long len, const char **name) { const struct compress_format *cf; diff --git a/lib/decompress_bunzip2.c b/lib/decompress_bunzip2.c index 31c5f7675fbf..8290e0bef7ea 100644 --- a/lib/decompress_bunzip2.c +++ b/lib/decompress_bunzip2.c @@ -92,8 +92,8 @@ struct bunzip_data { /* State for interrupting output loop */ int writeCopies, writePos, writeRunCountdown, writeCount, writeCurrent; /* I/O tracking data (file handles, buffers, positions, etc.) */ - int (*fill)(void*, unsigned int); - int inbufCount, inbufPos /*, outbufPos*/; + long (*fill)(void*, unsigned long); + long inbufCount, inbufPos /*, outbufPos*/; unsigned char *inbuf /*,*outbuf*/; unsigned int inbufBitCount, inbufBits; /* The CRC values stored in the block header and calculated from the @@ -617,7 +617,7 @@ decode_next_byte: goto decode_next_byte; } -static int INIT nofill(void *buf, unsigned int len) +static long INIT nofill(void *buf, unsigned long len) { return -1; } @@ -625,8 +625,8 @@ static int INIT nofill(void *buf, unsigned int len) /* Allocate the structure, read file header. If in_fd ==-1, inbuf must contain a complete bunzip file (len bytes long). If in_fd!=-1, inbuf and len are ignored, and data is read from file handle into temporary buffer. */ -static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, - int (*fill)(void*, unsigned int)) +static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, long len, + long (*fill)(void*, unsigned long)) { struct bunzip_data *bd; unsigned int i, j, c; @@ -675,11 +675,11 @@ static int INIT start_bunzip(struct bunzip_data **bdp, void *inbuf, int len, /* Example usage: decompress src_fd to dst_fd. (Stops at end of bzip2 data, not end of file.) */ -STATIC int INIT bunzip2(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT bunzip2(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *pos, + long *pos, void(*error)(char *x)) { struct bunzip_data *bd; @@ -743,11 +743,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *outbuf, - int *pos, + long *pos, void(*error)(char *x)) { return bunzip2(buf, len - 4, fill, flush, outbuf, pos, error); diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index 0edfd742a154..d4c7891635ec 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -27,17 +27,17 @@ #define GZIP_IOBUF_SIZE (16*1024) -static int INIT nofill(void *buffer, unsigned int len) +static long INIT nofill(void *buffer, unsigned long len) { return -1; } /* Included from initramfs et al code */ -STATIC int INIT gunzip(unsigned char *buf, int len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT gunzip(unsigned char *buf, long len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *out_buf, - int *pos, + long *pos, void(*error)(char *x)) { u8 *zbuf; struct z_stream_s *strm; @@ -142,7 +142,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, /* Write any data generated */ if (flush && strm->next_out > out_buf) { - int l = strm->next_out - out_buf; + long l = strm->next_out - out_buf; if (l != flush(out_buf, l)) { rc = -1; error("write error"); diff --git a/lib/decompress_unlz4.c b/lib/decompress_unlz4.c index 7d1e83caf8ad..40f66ebe57b7 100644 --- a/lib/decompress_unlz4.c +++ b/lib/decompress_unlz4.c @@ -31,10 +31,10 @@ #define LZ4_DEFAULT_UNCOMPRESSED_CHUNK_SIZE (8 << 20) #define ARCHIVE_MAGICNUMBER 0x184C2102 -STATIC inline int INIT unlz4(u8 *input, int in_len, - int (*fill) (void *, unsigned int), - int (*flush) (void *, unsigned int), - u8 *output, int *posp, +STATIC inline int INIT unlz4(u8 *input, long in_len, + long (*fill)(void *, unsigned long), + long (*flush)(void *, unsigned long), + u8 *output, long *posp, void (*error) (char *x)) { int ret = -1; @@ -43,7 +43,7 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, u8 *inp; u8 *inp_start; u8 *outp; - int size = in_len; + long size = in_len; #ifdef PREBOOT size_t out_len = get_unaligned_le32(input + in_len); #endif @@ -83,13 +83,20 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, if (posp) *posp = 0; - if (fill) - fill(inp, 4); + if (fill) { + size = fill(inp, 4); + if (size < 4) { + error("data corrupted"); + goto exit_2; + } + } chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; + if (!fill) { + inp += 4; + size -= 4; + } } else { error("invalid header"); goto exit_2; @@ -100,29 +107,44 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, for (;;) { - if (fill) - fill(inp, 4); + if (fill) { + size = fill(inp, 4); + if (size == 0) + break; + if (size < 4) { + error("data corrupted"); + goto exit_2; + } + } chunksize = get_unaligned_le32(inp); if (chunksize == ARCHIVE_MAGICNUMBER) { - inp += 4; - size -= 4; + if (!fill) { + inp += 4; + size -= 4; + } if (posp) *posp += 4; continue; } - inp += 4; - size -= 4; + if (posp) *posp += 4; - if (fill) { + if (!fill) { + inp += 4; + size -= 4; + } else { if (chunksize > lz4_compressbound(uncomp_chunksize)) { error("chunk length is longer than allocated"); goto exit_2; } - fill(inp, chunksize); + size = fill(inp, chunksize); + if (size < chunksize) { + error("data corrupted"); + goto exit_2; + } } #ifdef PREBOOT if (out_len >= uncomp_chunksize) { @@ -149,18 +171,17 @@ STATIC inline int INIT unlz4(u8 *input, int in_len, if (posp) *posp += chunksize; - size -= chunksize; + if (!fill) { + size -= chunksize; - if (size == 0) - break; - else if (size < 0) { - error("data corrupted"); - goto exit_2; + if (size == 0) + break; + else if (size < 0) { + error("data corrupted"); + goto exit_2; + } + inp += chunksize; } - - inp += chunksize; - if (fill) - inp = inp_start; } ret = 0; @@ -175,11 +196,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { diff --git a/lib/decompress_unlzma.c b/lib/decompress_unlzma.c index 32adb73a9038..0be83af62b88 100644 --- a/lib/decompress_unlzma.c +++ b/lib/decompress_unlzma.c @@ -65,11 +65,11 @@ static long long INIT read_int(unsigned char *ptr, int size) #define LZMA_IOBUF_SIZE 0x10000 struct rc { - int (*fill)(void*, unsigned int); + long (*fill)(void*, unsigned long); uint8_t *ptr; uint8_t *buffer; uint8_t *buffer_end; - int buffer_size; + long buffer_size; uint32_t code; uint32_t range; uint32_t bound; @@ -82,7 +82,7 @@ struct rc { #define RC_MODEL_TOTAL_BITS 11 -static int INIT nofill(void *buffer, unsigned int len) +static long INIT nofill(void *buffer, unsigned long len) { return -1; } @@ -99,8 +99,8 @@ static void INIT rc_read(struct rc *rc) /* Called once */ static inline void INIT rc_init(struct rc *rc, - int (*fill)(void*, unsigned int), - char *buffer, int buffer_size) + long (*fill)(void*, unsigned long), + char *buffer, long buffer_size) { if (fill) rc->fill = fill; @@ -280,7 +280,7 @@ struct writer { size_t buffer_pos; int bufsize; size_t global_pos; - int(*flush)(void*, unsigned int); + long (*flush)(void*, unsigned long); struct lzma_header *header; }; @@ -534,11 +534,11 @@ static inline int INIT process_bit1(struct writer *wr, struct rc *rc, -STATIC inline int INIT unlzma(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC inline int INIT unlzma(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { @@ -667,11 +667,11 @@ exit_0: } #ifdef PREBOOT -STATIC int INIT decompress(unsigned char *buf, int in_len, - int(*fill)(void*, unsigned int), - int(*flush)(void*, unsigned int), +STATIC int INIT decompress(unsigned char *buf, long in_len, + long (*fill)(void*, unsigned long), + long (*flush)(void*, unsigned long), unsigned char *output, - int *posp, + long *posp, void(*error)(char *x) ) { diff --git a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c index 960183d4258f..b94a31bdd87d 100644 --- a/lib/decompress_unlzo.c +++ b/lib/decompress_unlzo.c @@ -51,7 +51,7 @@ static const unsigned char lzop_magic[] = { #define HEADER_SIZE_MIN (9 + 7 + 4 + 8 + 1 + 4) #define HEADER_SIZE_MAX (9 + 7 + 1 + 8 + 8 + 4 + 1 + 255 + 4) -STATIC inline int INIT parse_header(u8 *input, int *skip, int in_len) +STATIC inline long INIT parse_header(u8 *input, long *skip, long in_len) { int l; u8 *parse = input; @@ -108,14 +108,14 @@ STATIC inline int INIT parse_header(u8 *input, int *skip, int in_len) return 1; } -STATIC inline int INIT unlzo(u8 *input, int in_len, - int (*fill) (void *, unsigned int), - int (*flush) (void *, unsigned int), - u8 *output, int *posp, +STATIC int INIT unlzo(u8 *input, long in_len, + long (*fill)(void *, unsigned long), + long (*flush)(void *, unsigned long), + u8 *output, long *posp, void (*error) (char *x)) { u8 r = 0; - int skip = 0; + long skip = 0; u32 src_len, dst_len; size_t tmp; u8 *in_buf, *in_buf_save, *out_buf; diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 9f34eb56854d..b07a78340e9d 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -248,10 +248,10 @@ void *memmove(void *dest, const void *src, size_t size) * both input and output buffers are available as a single chunk, i.e. when * fill() and flush() won't be used. */ -STATIC int INIT unxz(unsigned char *in, int in_size, - int (*fill)(void *dest, unsigned int size), - int (*flush)(void *src, unsigned int size), - unsigned char *out, int *in_used, +STATIC int INIT unxz(unsigned char *in, long in_size, + long (*fill)(void *dest, unsigned long size), + long (*flush)(void *src, unsigned long size), + unsigned char *out, long *in_used, void (*error)(char *x)) { struct xz_buf b; @@ -329,7 +329,7 @@ STATIC int INIT unxz(unsigned char *in, int in_size, * returned by xz_dec_run(), but probably * it's not too bad. */ - if (flush(b.out, b.out_pos) != (int)b.out_pos) + if (flush(b.out, b.out_pos) != (long)b.out_pos) ret = XZ_BUF_ERROR; b.out_pos = 0; diff --git a/lib/glob.c b/lib/glob.c new file mode 100644 index 000000000000..500fc80d23e1 --- /dev/null +++ b/lib/glob.c @@ -0,0 +1,287 @@ +#include <linux/module.h> +#include <linux/glob.h> + +/* + * The only reason this code can be compiled as a module is because the + * ATA code that depends on it can be as well. In practice, they're + * both usually compiled in and the module overhead goes away. + */ +MODULE_DESCRIPTION("glob(7) matching"); +MODULE_LICENSE("Dual MIT/GPL"); + +/** + * glob_match - Shell-style pattern matching, like !fnmatch(pat, str, 0) + * @pat: Shell-style pattern to match, e.g. "*.[ch]". + * @str: String to match. The pattern must match the entire string. + * + * Perform shell-style glob matching, returning true (1) if the match + * succeeds, or false (0) if it fails. Equivalent to !fnmatch(@pat, @str, 0). + * + * Pattern metacharacters are ?, *, [ and \. + * (And, inside character classes, !, - and ].) + * + * This is small and simple implementation intended for device blacklists + * where a string is matched against a number of patterns. Thus, it + * does not preprocess the patterns. It is non-recursive, and run-time + * is at most quadratic: strlen(@str)*strlen(@pat). + * + * An example of the worst case is glob_match("*aaaaa", "aaaaaaaaaa"); + * it takes 6 passes over the pattern before matching the string. + * + * Like !fnmatch(@pat, @str, 0) and unlike the shell, this does NOT + * treat / or leading . specially; it isn't actually used for pathnames. + * + * Note that according to glob(7) (and unlike bash), character classes + * are complemented by a leading !; this does not support the regex-style + * [^a-z] syntax. + * + * An opening bracket without a matching close is matched literally. + */ +bool __pure glob_match(char const *pat, char const *str) +{ + /* + * Backtrack to previous * on mismatch and retry starting one + * character later in the string. Because * matches all characters + * (no exception for /), it can be easily proved that there's + * never a need to backtrack multiple levels. + */ + char const *back_pat = NULL, *back_str = back_str; + + /* + * Loop over each token (character or class) in pat, matching + * it against the remaining unmatched tail of str. Return false + * on mismatch, or true after matching the trailing nul bytes. + */ + for (;;) { + unsigned char c = *str++; + unsigned char d = *pat++; + + switch (d) { + case '?': /* Wildcard: anything but nul */ + if (c == '\0') + return false; + break; + case '*': /* Any-length wildcard */ + if (*pat == '\0') /* Optimize trailing * case */ + return true; + back_pat = pat; + back_str = --str; /* Allow zero-length match */ + break; + case '[': { /* Character class */ + bool match = false, inverted = (*pat == '!'); + char const *class = pat + inverted; + unsigned char a = *class++; + + /* + * Iterate over each span in the character class. + * A span is either a single character a, or a + * range a-b. The first span may begin with ']'. + */ + do { + unsigned char b = a; + + if (a == '\0') /* Malformed */ + goto literal; + + if (class[0] == '-' && class[1] != ']') { + b = class[1]; + + if (b == '\0') + goto literal; + + class += 2; + /* Any special action if a > b? */ + } + match |= (a <= c && c <= b); + } while ((a = *class++) != ']'); + + if (match == inverted) + goto backtrack; + pat = class; + } + break; + case '\\': + d = *pat++; + /*FALLTHROUGH*/ + default: /* Literal character */ +literal: + if (c == d) { + if (d == '\0') + return true; + break; + } +backtrack: + if (c == '\0' || !back_pat) + return false; /* No point continuing */ + /* Try again from last *, one character later in str. */ + pat = back_pat; + str = ++back_str; + break; + } + } +} +EXPORT_SYMBOL(glob_match); + + +#ifdef CONFIG_GLOB_SELFTEST + +#include <linux/printk.h> +#include <linux/moduleparam.h> + +/* Boot with "glob.verbose=1" to show successful tests, too */ +static bool verbose = false; +module_param(verbose, bool, 0); + +struct glob_test { + char const *pat, *str; + bool expected; +}; + +static bool __pure __init test(char const *pat, char const *str, bool expected) +{ + bool match = glob_match(pat, str); + bool success = match == expected; + + /* Can't get string literals into a particular section, so... */ + static char const msg_error[] __initconst = + KERN_ERR "glob: \"%s\" vs. \"%s\": %s *** ERROR ***\n"; + static char const msg_ok[] __initconst = + KERN_DEBUG "glob: \"%s\" vs. \"%s\": %s OK\n"; + static char const mismatch[] __initconst = "mismatch"; + char const *message; + + if (!success) + message = msg_error; + else if (verbose) + message = msg_ok; + else + return success; + + printk(message, pat, str, mismatch + 3*match); + return success; +} + +/* + * The tests are all jammed together in one array to make it simpler + * to place that array in the .init.rodata section. The obvious + * "array of structures containing char *" has no way to force the + * pointed-to strings to be in a particular section. + * + * Anyway, a test consists of: + * 1. Expected glob_match result: '1' or '0'. + * 2. Pattern to match: null-terminated string + * 3. String to match against: null-terminated string + * + * The list of tests is terminated with a final '\0' instead of + * a glob_match result character. + */ +static char const glob_tests[] __initconst = + /* Some basic tests */ + "1" "a\0" "a\0" + "0" "a\0" "b\0" + "0" "a\0" "aa\0" + "0" "a\0" "\0" + "1" "\0" "\0" + "0" "\0" "a\0" + /* Simple character class tests */ + "1" "[a]\0" "a\0" + "0" "[a]\0" "b\0" + "0" "[!a]\0" "a\0" + "1" "[!a]\0" "b\0" + "1" "[ab]\0" "a\0" + "1" "[ab]\0" "b\0" + "0" "[ab]\0" "c\0" + "1" "[!ab]\0" "c\0" + "1" "[a-c]\0" "b\0" + "0" "[a-c]\0" "d\0" + /* Corner cases in character class parsing */ + "1" "[a-c-e-g]\0" "-\0" + "0" "[a-c-e-g]\0" "d\0" + "1" "[a-c-e-g]\0" "f\0" + "1" "[]a-ceg-ik[]\0" "a\0" + "1" "[]a-ceg-ik[]\0" "]\0" + "1" "[]a-ceg-ik[]\0" "[\0" + "1" "[]a-ceg-ik[]\0" "h\0" + "0" "[]a-ceg-ik[]\0" "f\0" + "0" "[!]a-ceg-ik[]\0" "h\0" + "0" "[!]a-ceg-ik[]\0" "]\0" + "1" "[!]a-ceg-ik[]\0" "f\0" + /* Simple wild cards */ + "1" "?\0" "a\0" + "0" "?\0" "aa\0" + "0" "??\0" "a\0" + "1" "?x?\0" "axb\0" + "0" "?x?\0" "abx\0" + "0" "?x?\0" "xab\0" + /* Asterisk wild cards (backtracking) */ + "0" "*??\0" "a\0" + "1" "*??\0" "ab\0" + "1" "*??\0" "abc\0" + "1" "*??\0" "abcd\0" + "0" "??*\0" "a\0" + "1" "??*\0" "ab\0" + "1" "??*\0" "abc\0" + "1" "??*\0" "abcd\0" + "0" "?*?\0" "a\0" + "1" "?*?\0" "ab\0" + "1" "?*?\0" "abc\0" + "1" "?*?\0" "abcd\0" + "1" "*b\0" "b\0" + "1" "*b\0" "ab\0" + "0" "*b\0" "ba\0" + "1" "*b\0" "bb\0" + "1" "*b\0" "abb\0" + "1" "*b\0" "bab\0" + "1" "*bc\0" "abbc\0" + "1" "*bc\0" "bc\0" + "1" "*bc\0" "bbc\0" + "1" "*bc\0" "bcbc\0" + /* Multiple asterisks (complex backtracking) */ + "1" "*ac*\0" "abacadaeafag\0" + "1" "*ac*ae*ag*\0" "abacadaeafag\0" + "1" "*a*b*[bc]*[ef]*g*\0" "abacadaeafag\0" + "0" "*a*b*[ef]*[cd]*g*\0" "abacadaeafag\0" + "1" "*abcd*\0" "abcabcabcabcdefg\0" + "1" "*ab*cd*\0" "abcabcabcabcdefg\0" + "1" "*abcd*abcdef*\0" "abcabcdabcdeabcdefg\0" + "0" "*abcd*\0" "abcabcabcabcefg\0" + "0" "*ab*cd*\0" "abcabcabcabcefg\0"; + +static int __init glob_init(void) +{ + unsigned successes = 0; + unsigned n = 0; + char const *p = glob_tests; + static char const message[] __initconst = + KERN_INFO "glob: %u self-tests passed, %u failed\n"; + + /* + * Tests are jammed together in a string. The first byte is '1' + * or '0' to indicate the expected outcome, or '\0' to indicate the + * end of the tests. Then come two null-terminated strings: the + * pattern and the string to match it against. + */ + while (*p) { + bool expected = *p++ & 1; + char const *pat = p; + + p += strlen(p) + 1; + successes += test(pat, p, expected); + p += strlen(p) + 1; + n++; + } + + n -= successes; + printk(message, successes, n); + + /* What's the errno for "kernel bug detected"? Guess... */ + return n ? -ECANCELED : 0; +} + +/* We need a dummy exit function to allow unload */ +static void __exit glob_fini(void) { } + +module_init(glob_init); +module_exit(glob_fini); + +#endif /* CONFIG_GLOB_SELFTEST */ diff --git a/lib/idr.c b/lib/idr.c index 39158abebad1..50be3fa9b657 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -590,26 +590,27 @@ static void __idr_remove_all(struct idr *idp) struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = idp->top; + *paa = idp->top; RCU_INIT_POINTER(idp->top, NULL); max = idr_max(idp->layers); id = 0; while (id >= 0 && id <= max) { + p = *paa; while (n > IDR_BITS && p) { n -= IDR_BITS; - *paa++ = p; p = p->ary[(id >> n) & IDR_MASK]; + *++paa = p; } bt_mask = id; id += 1 << n; /* Get the highest bit that the above add changed from 0->1. */ while (n < fls(id ^ bt_mask)) { - if (p) - free_layer(idp, p); + if (*paa) + free_layer(idp, *paa); n += IDR_BITS; - p = *--paa; + --paa; } } idp->layers = 0; @@ -692,15 +693,16 @@ int idr_for_each(struct idr *idp, struct idr_layer **paa = &pa[0]; n = idp->layers * IDR_BITS; - p = rcu_dereference_raw(idp->top); + *paa = rcu_dereference_raw(idp->top); max = idr_max(idp->layers); id = 0; while (id >= 0 && id <= max) { + p = *paa; while (n > 0 && p) { n -= IDR_BITS; - *paa++ = p; p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); + *++paa = p; } if (p) { @@ -712,7 +714,7 @@ int idr_for_each(struct idr *idp, id += 1 << n; while (n < fls(id)) { n += IDR_BITS; - p = *--paa; + --paa; } } @@ -740,17 +742,18 @@ void *idr_get_next(struct idr *idp, int *nextidp) int n, max; /* find first ent */ - p = rcu_dereference_raw(idp->top); + p = *paa = rcu_dereference_raw(idp->top); if (!p) return NULL; n = (p->layer + 1) * IDR_BITS; max = idr_max(p->layer + 1); while (id >= 0 && id <= max) { + p = *paa; while (n > 0 && p) { n -= IDR_BITS; - *paa++ = p; p = rcu_dereference_raw(p->ary[(id >> n) & IDR_MASK]); + *++paa = p; } if (p) { @@ -768,7 +771,7 @@ void *idr_get_next(struct idr *idp, int *nextidp) id = round_up(id + 1, 1 << n); while (n < fls(id)) { n += IDR_BITS; - p = *--paa; + --paa; } } return NULL; diff --git a/lib/kfifo.c b/lib/kfifo.c index d79b9d222065..90ba1eb1df06 100644 --- a/lib/kfifo.c +++ b/lib/kfifo.c @@ -561,8 +561,7 @@ EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (!nents) - BUG(); + BUG_ON(!nents); len = __kfifo_max_r(len, recsize); @@ -585,8 +584,7 @@ EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) { - if (!nents) - BUG(); + BUG_ON(!nents); len = __kfifo_max_r(len, recsize); diff --git a/lib/klist.c b/lib/klist.c index 358a368a2947..89b485a2a58d 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -140,11 +140,11 @@ void klist_add_tail(struct klist_node *n, struct klist *k) EXPORT_SYMBOL_GPL(klist_add_tail); /** - * klist_add_after - Init a klist_node and add it after an existing node + * klist_add_behind - Init a klist_node and add it after an existing node * @n: node we're adding. * @pos: node to put @n after */ -void klist_add_after(struct klist_node *n, struct klist_node *pos) +void klist_add_behind(struct klist_node *n, struct klist_node *pos) { struct klist *k = knode_klist(pos); @@ -153,7 +153,7 @@ void klist_add_after(struct klist_node *n, struct klist_node *pos) list_add(&n->n_node, &pos->n_node); spin_unlock(&k->k_lock); } -EXPORT_SYMBOL_GPL(klist_add_after); +EXPORT_SYMBOL_GPL(klist_add_behind); /** * klist_add_before - Init a klist_node and add it before an existing node diff --git a/lib/list_sort.c b/lib/list_sort.c index 1183fa70a44d..12bcba1c8612 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,3 +1,6 @@ + +#define pr_fmt(fmt) "list_sort_test: " fmt + #include <linux/kernel.h> #include <linux/module.h> #include <linux/list_sort.h> @@ -47,6 +50,7 @@ static void merge_and_restore_back_links(void *priv, struct list_head *a, struct list_head *b) { struct list_head *tail = head; + u8 count = 0; while (a && b) { /* if equal, take 'a' -- important for sort stability */ @@ -70,7 +74,8 @@ static void merge_and_restore_back_links(void *priv, * element comparison is needed, so the client's cmp() * routine can invoke cond_resched() periodically. */ - (*cmp)(priv, tail->next, tail->next); + if (unlikely(!(++count))) + (*cmp)(priv, tail->next, tail->next); tail->next->prev = tail; tail = tail->next; @@ -123,9 +128,7 @@ void list_sort(void *priv, struct list_head *head, } if (lev > max_lev) { if (unlikely(lev >= ARRAY_SIZE(part)-1)) { - printk_once(KERN_DEBUG "list passed to" - " list_sort() too long for" - " efficiency\n"); + printk_once(KERN_DEBUG "list too long for efficiency\n"); lev--; } max_lev = lev; @@ -168,27 +171,25 @@ static struct debug_el **elts __initdata; static int __init check(struct debug_el *ela, struct debug_el *elb) { if (ela->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - ela->serial); + pr_err("error: incorrect serial %d\n", ela->serial); return -EINVAL; } if (elb->serial >= TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: incorrect serial %d\n", - elb->serial); + pr_err("error: incorrect serial %d\n", elb->serial); return -EINVAL; } if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - printk(KERN_ERR "list_sort_test: error: phantom element\n"); + pr_err("error: phantom element\n"); return -EINVAL; } if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); return -EINVAL; } if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - printk(KERN_ERR "list_sort_test: error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); return -EINVAL; } return 0; @@ -207,25 +208,23 @@ static int __init cmp(void *priv, struct list_head *a, struct list_head *b) static int __init list_sort_test(void) { - int i, count = 1, err = -EINVAL; + int i, count = 1, err = -ENOMEM; struct debug_el *el; - struct list_head *cur, *tmp; + struct list_head *cur; LIST_HEAD(head); - printk(KERN_DEBUG "list_sort_test: start testing list_sort()\n"); + pr_debug("start testing list_sort()\n"); - elts = kmalloc(sizeof(void *) * TEST_LIST_LEN, GFP_KERNEL); + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); if (!elts) { - printk(KERN_ERR "list_sort_test: error: cannot allocate " - "memory\n"); - goto exit; + pr_err("error: cannot allocate memory\n"); + return err; } for (i = 0; i < TEST_LIST_LEN; i++) { el = kmalloc(sizeof(*el), GFP_KERNEL); if (!el) { - printk(KERN_ERR "list_sort_test: error: cannot " - "allocate memory\n"); + pr_err("error: cannot allocate memory\n"); goto exit; } /* force some equivalencies */ @@ -239,52 +238,52 @@ static int __init list_sort_test(void) list_sort(NULL, &head, cmp); + err = -EINVAL; for (cur = head.next; cur->next != &head; cur = cur->next) { struct debug_el *el1; int cmp_result; if (cur->next->prev != cur) { - printk(KERN_ERR "list_sort_test: error: list is " - "corrupted\n"); + pr_err("error: list is corrupted\n"); goto exit; } cmp_result = cmp(NULL, cur, cur->next); if (cmp_result > 0) { - printk(KERN_ERR "list_sort_test: error: list is not " - "sorted\n"); + pr_err("error: list is not sorted\n"); goto exit; } el = container_of(cur, struct debug_el, list); el1 = container_of(cur->next, struct debug_el, list); if (cmp_result == 0 && el->serial >= el1->serial) { - printk(KERN_ERR "list_sort_test: error: order of " - "equivalent elements not preserved\n"); + pr_err("error: order of equivalent elements not " + "preserved\n"); goto exit; } if (check(el, el1)) { - printk(KERN_ERR "list_sort_test: error: element check " - "failed\n"); + pr_err("error: element check failed\n"); goto exit; } count++; } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + if (count != TEST_LIST_LEN) { - printk(KERN_ERR "list_sort_test: error: bad list length %d", - count); + pr_err("error: bad list length %d", count); goto exit; } err = 0; exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); kfree(elts); - list_for_each_safe(cur, tmp, &head) { - list_del(cur); - kfree(container_of(cur, struct debug_el, list)); - } return err; } module_init(list_sort_test); diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 3a8e8e8fb2a5..4251cbd5becb 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -73,7 +73,7 @@ EXPORT_SYMBOL(sg_nents); **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN struct scatterlist *ret = &sgl[nents - 1]; #else struct scatterlist *sg, *ret = NULL; @@ -251,7 +251,7 @@ int __sg_alloc_table(struct sg_table *table, unsigned int nents, if (nents == 0) return -EINVAL; -#ifndef ARCH_HAS_SG_CHAIN +#ifndef CONFIG_ARCH_HAS_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif diff --git a/lib/string_helpers.c b/lib/string_helpers.c index ed5c1454dd62..29033f319aea 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -25,12 +25,15 @@ int string_get_size(u64 size, const enum string_size_units units, char *buf, int len) { - static const char *units_10[] = { "B", "kB", "MB", "GB", "TB", "PB", - "EB", "ZB", "YB", NULL}; - static const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", - "EiB", "ZiB", "YiB", NULL }; - static const char **units_str[] = { - [STRING_UNITS_10] = units_10, + static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", NULL + }; + static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB", + NULL + }; + static const char *const *const units_str[] = { + [STRING_UNITS_10] = units_10, [STRING_UNITS_2] = units_2, }; static const unsigned int divisor[] = { diff --git a/lib/test-kstrtox.c b/lib/test-kstrtox.c index bea3f3fa3f02..4137bca5f8e8 100644 --- a/lib/test-kstrtox.c +++ b/lib/test-kstrtox.c @@ -3,7 +3,7 @@ #include <linux/module.h> #define for_each_test(i, test) \ - for (i = 0; i < sizeof(test) / sizeof(test[0]); i++) + for (i = 0; i < ARRAY_SIZE(test); i++) struct test_fail { const char *str; diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 6fe2c84eb055..0eced40344dd 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1183,6 +1183,21 @@ char *address_val(char *buf, char *end, const void *addr, return number(buf, end, num, spec); } +static noinline_for_stack +char *comm_name(char *buf, char *end, struct task_struct *tsk, + struct printf_spec spec, const char *fmt) +{ + char name[TASK_COMM_LEN]; + + /* Caller can pass NULL instead of current. */ + if (!tsk) + tsk = current; + /* Not using get_task_comm() in case I'm in IRQ context. */ + memcpy(name, tsk->comm, TASK_COMM_LEN); + name[sizeof(name) - 1] = '\0'; + return string(buf, end, name, spec); +} + int kptr_restrict __read_mostly; /* @@ -1250,6 +1265,7 @@ int kptr_restrict __read_mostly; * (default assumed to be phys_addr_t, passed by reference) * - 'd[234]' For a dentry name (optionally 2-4 last components) * - 'D[234]' Same as 'd' but for a struct file + * - 'T' task_struct->comm * * Note: The difference between 'S' and 'F' is that on ia64 and ppc64 * function pointers are really function descriptors, which contain a @@ -1261,7 +1277,7 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, { int default_width = 2 * sizeof(void *) + (spec.flags & SPECIAL ? 2 : 0); - if (!ptr && *fmt != 'K') { + if (!ptr && *fmt != 'K' && *fmt != 'T') { /* * Print (null) with the same width as a pointer so it makes * tabular output look nice. @@ -1389,6 +1405,8 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, return dentry_name(buf, end, ((const struct file *)ptr)->f_path.dentry, spec, fmt); + case 'T': + return comm_name(buf, end, ptr, spec, fmt); } spec.flags |= SMALL; if (spec.field_width == -1) { diff --git a/lib/zlib_deflate/deflate.c b/lib/zlib_deflate/deflate.c index d63381e8e333..d20ef458f137 100644 --- a/lib/zlib_deflate/deflate.c +++ b/lib/zlib_deflate/deflate.c @@ -250,52 +250,6 @@ int zlib_deflateInit2( } /* ========================================================================= */ -#if 0 -int zlib_deflateSetDictionary( - z_streamp strm, - const Byte *dictionary, - uInt dictLength -) -{ - deflate_state *s; - uInt length = dictLength; - uInt n; - IPos hash_head = 0; - - if (strm == NULL || strm->state == NULL || dictionary == NULL) - return Z_STREAM_ERROR; - - s = (deflate_state *) strm->state; - if (s->status != INIT_STATE) return Z_STREAM_ERROR; - - strm->adler = zlib_adler32(strm->adler, dictionary, dictLength); - - if (length < MIN_MATCH) return Z_OK; - if (length > MAX_DIST(s)) { - length = MAX_DIST(s); -#ifndef USE_DICT_HEAD - dictionary += dictLength - length; /* use the tail of the dictionary */ -#endif - } - memcpy((char *)s->window, dictionary, length); - s->strstart = length; - s->block_start = (long)length; - - /* Insert all strings in the hash table (except for the last two bytes). - * s->lookahead stays null, so s->ins_h will be recomputed at the next - * call of fill_window. - */ - s->ins_h = s->window[0]; - UPDATE_HASH(s, s->ins_h, s->window[1]); - for (n = 0; n <= length - MIN_MATCH; n++) { - INSERT_STRING(s, n, hash_head); - } - if (hash_head) hash_head = 0; /* to make compiler happy */ - return Z_OK; -} -#endif /* 0 */ - -/* ========================================================================= */ int zlib_deflateReset( z_streamp strm ) @@ -326,45 +280,6 @@ int zlib_deflateReset( return Z_OK; } -/* ========================================================================= */ -#if 0 -int zlib_deflateParams( - z_streamp strm, - int level, - int strategy -) -{ - deflate_state *s; - compress_func func; - int err = Z_OK; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - s = (deflate_state *) strm->state; - - if (level == Z_DEFAULT_COMPRESSION) { - level = 6; - } - if (level < 0 || level > 9 || strategy < 0 || strategy > Z_HUFFMAN_ONLY) { - return Z_STREAM_ERROR; - } - func = configuration_table[s->level].func; - - if (func != configuration_table[level].func && strm->total_in != 0) { - /* Flush the last buffer: */ - err = zlib_deflate(strm, Z_PARTIAL_FLUSH); - } - if (s->level != level) { - s->level = level; - s->max_lazy_match = configuration_table[level].max_lazy; - s->good_match = configuration_table[level].good_length; - s->nice_match = configuration_table[level].nice_length; - s->max_chain_length = configuration_table[level].max_chain; - } - s->strategy = strategy; - return err; -} -#endif /* 0 */ - /* ========================================================================= * Put a short in the pending buffer. The 16-bit value is put in MSB order. * IN assertion: the stream state is correct and there is enough room in @@ -568,64 +483,6 @@ int zlib_deflateEnd( return status == BUSY_STATE ? Z_DATA_ERROR : Z_OK; } -/* ========================================================================= - * Copy the source state to the destination state. - */ -#if 0 -int zlib_deflateCopy ( - z_streamp dest, - z_streamp source -) -{ -#ifdef MAXSEG_64K - return Z_STREAM_ERROR; -#else - deflate_state *ds; - deflate_state *ss; - ush *overlay; - deflate_workspace *mem; - - - if (source == NULL || dest == NULL || source->state == NULL) { - return Z_STREAM_ERROR; - } - - ss = (deflate_state *) source->state; - - *dest = *source; - - mem = (deflate_workspace *) dest->workspace; - - ds = &(mem->deflate_memory); - - dest->state = (struct internal_state *) ds; - *ds = *ss; - ds->strm = dest; - - ds->window = (Byte *) mem->window_memory; - ds->prev = (Pos *) mem->prev_memory; - ds->head = (Pos *) mem->head_memory; - overlay = (ush *) mem->overlay_memory; - ds->pending_buf = (uch *) overlay; - - memcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte)); - memcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos)); - memcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos)); - memcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size); - - ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf); - ds->d_buf = overlay + ds->lit_bufsize/sizeof(ush); - ds->l_buf = ds->pending_buf + (1+sizeof(ush))*ds->lit_bufsize; - - ds->l_desc.dyn_tree = ds->dyn_ltree; - ds->d_desc.dyn_tree = ds->dyn_dtree; - ds->bl_desc.dyn_tree = ds->bl_tree; - - return Z_OK; -#endif -} -#endif /* 0 */ - /* =========================================================================== * Read a new buffer from the current input stream, update the adler32 * and total number of bytes read. All deflate() input goes through diff --git a/lib/zlib_inflate/inflate.c b/lib/zlib_inflate/inflate.c index f5ce87b0800e..58a733b10387 100644 --- a/lib/zlib_inflate/inflate.c +++ b/lib/zlib_inflate/inflate.c @@ -45,21 +45,6 @@ int zlib_inflateReset(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflatePrime(z_streamp strm, int bits, int value) -{ - struct inflate_state *state; - - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR; - value &= (1L << bits) - 1; - state->hold += value << state->bits; - state->bits += bits; - return Z_OK; -} -#endif - int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; @@ -761,123 +746,6 @@ int zlib_inflateEnd(z_streamp strm) return Z_OK; } -#if 0 -int zlib_inflateSetDictionary(z_streamp strm, const Byte *dictionary, - uInt dictLength) -{ - struct inflate_state *state; - unsigned long id; - - /* check state */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (state->wrap != 0 && state->mode != DICT) - return Z_STREAM_ERROR; - - /* check for correct dictionary id */ - if (state->mode == DICT) { - id = zlib_adler32(0L, NULL, 0); - id = zlib_adler32(id, dictionary, dictLength); - if (id != state->check) - return Z_DATA_ERROR; - } - - /* copy dictionary to window */ - zlib_updatewindow(strm, strm->avail_out); - - if (dictLength > state->wsize) { - memcpy(state->window, dictionary + dictLength - state->wsize, - state->wsize); - state->whave = state->wsize; - } - else { - memcpy(state->window + state->wsize - dictLength, dictionary, - dictLength); - state->whave = dictLength; - } - state->havedict = 1; - return Z_OK; -} -#endif - -#if 0 -/* - Search buf[0..len-1] for the pattern: 0, 0, 0xff, 0xff. Return when found - or when out of input. When called, *have is the number of pattern bytes - found in order so far, in 0..3. On return *have is updated to the new - state. If on return *have equals four, then the pattern was found and the - return value is how many bytes were read including the last byte of the - pattern. If *have is less than four, then the pattern has not been found - yet and the return value is len. In the latter case, zlib_syncsearch() can be - called again with more data and the *have state. *have is initialized to - zero for the first call. - */ -static unsigned zlib_syncsearch(unsigned *have, unsigned char *buf, - unsigned len) -{ - unsigned got; - unsigned next; - - got = *have; - next = 0; - while (next < len && got < 4) { - if ((int)(buf[next]) == (got < 2 ? 0 : 0xff)) - got++; - else if (buf[next]) - got = 0; - else - got = 4 - got; - next++; - } - *have = got; - return next; -} -#endif - -#if 0 -int zlib_inflateSync(z_streamp strm) -{ - unsigned len; /* number of bytes to look at or looked at */ - unsigned long in, out; /* temporary to save total_in and total_out */ - unsigned char buf[4]; /* to restore bit buffer to byte string */ - struct inflate_state *state; - - /* check parameters */ - if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; - state = (struct inflate_state *)strm->state; - if (strm->avail_in == 0 && state->bits < 8) return Z_BUF_ERROR; - - /* if first time, start search in bit buffer */ - if (state->mode != SYNC) { - state->mode = SYNC; - state->hold <<= state->bits & 7; - state->bits -= state->bits & 7; - len = 0; - while (state->bits >= 8) { - buf[len++] = (unsigned char)(state->hold); - state->hold >>= 8; - state->bits -= 8; - } - state->have = 0; - zlib_syncsearch(&(state->have), buf, len); - } - - /* search available input */ - len = zlib_syncsearch(&(state->have), strm->next_in, strm->avail_in); - strm->avail_in -= len; - strm->next_in += len; - strm->total_in += len; - - /* return no joy or set up to restart inflate() on a new block */ - if (state->have != 4) return Z_DATA_ERROR; - in = strm->total_in; out = strm->total_out; - zlib_inflateReset(strm); - strm->total_in = in; strm->total_out = out; - state->mode = TYPE; - return Z_OK; -} -#endif - /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; diff --git a/mm/Kconfig b/mm/Kconfig index 3e9977a9d657..886db2158538 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -508,21 +508,34 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. -config ZBUD - tristate - default n +config CMA_AREAS + int "Maximum count of the CMA areas" + depends on CMA + default 7 help - A special purpose allocator for storing compressed pages. - It is designed to store up to two compressed pages per physical - page. While this design limits storage density, it has simple and - deterministic reclaim properties that make it preferable to a higher - density approach when reclaim will be used. + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value "7". + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. config ZSWAP bool "Compressed cache for swap pages (EXPERIMENTAL)" depends on FRONTSWAP && CRYPTO=y select CRYPTO_LZO - select ZBUD + select ZPOOL default n help A lightweight compressed cache for swap pages. It takes @@ -538,17 +551,22 @@ config ZSWAP they have not be fully explored on the large set of potential configurations and workloads that exist. -config MEM_SOFT_DIRTY - bool "Track memory changes" - depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS - select PROC_PAGE_MONITOR +config ZPOOL + tristate "Common API for compressed memory storage" + default n help - This option enables memory changes tracking by introducing a - soft-dirty bit on pte-s. This bit it set when someone writes - into a page just as regular dirty bit, but unlike the latter - it can be cleared by hands. + Compressed memory storage API. This allows using either zbud or + zsmalloc. - See Documentation/vm/soft-dirty.txt for more details. +config ZBUD + tristate "Low density storage for compressed pages" + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. config ZSMALLOC tristate "Memory allocator for compressed pages" diff --git a/mm/Makefile b/mm/Makefile index 4064f3ec145e..632ae77e6070 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -59,6 +59,8 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZPOOL) += zpool.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o +obj-$(CONFIG_CMA) += cma.o diff --git a/mm/cma.c b/mm/cma.c new file mode 100644 index 000000000000..c17751c0dcaf --- /dev/null +++ b/mm/cma.c @@ -0,0 +1,335 @@ +/* + * Contiguous Memory Allocator + * + * Copyright (c) 2010-2011 by Samsung Electronics. + * Copyright IBM Corporation, 2013 + * Copyright LG Electronics Inc., 2014 + * Written by: + * Marek Szyprowski <m.szyprowski@samsung.com> + * Michal Nazarewicz <mina86@mina86.com> + * Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * Joonsoo Kim <iamjoonsoo.kim@lge.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + */ + +#define pr_fmt(fmt) "cma: " fmt + +#ifdef CONFIG_CMA_DEBUG +#ifndef DEBUG +# define DEBUG +#endif +#endif + +#include <linux/memblock.h> +#include <linux/err.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/sizes.h> +#include <linux/slab.h> +#include <linux/log2.h> +#include <linux/cma.h> + +struct cma { + unsigned long base_pfn; + unsigned long count; + unsigned long *bitmap; + unsigned int order_per_bit; /* Order of pages represented by one bit */ + struct mutex lock; +}; + +static struct cma cma_areas[MAX_CMA_AREAS]; +static unsigned cma_area_count; +static DEFINE_MUTEX(cma_mutex); + +phys_addr_t cma_get_base(struct cma *cma) +{ + return PFN_PHYS(cma->base_pfn); +} + +unsigned long cma_get_size(struct cma *cma) +{ + return cma->count << PAGE_SHIFT; +} + +static unsigned long cma_bitmap_aligned_mask(struct cma *cma, int align_order) +{ + return (1UL << (align_order >> cma->order_per_bit)) - 1; +} + +static unsigned long cma_bitmap_maxno(struct cma *cma) +{ + return cma->count >> cma->order_per_bit; +} + +static unsigned long cma_bitmap_pages_to_bits(struct cma *cma, + unsigned long pages) +{ + return ALIGN(pages, 1UL << cma->order_per_bit) >> cma->order_per_bit; +} + +static void cma_clear_bitmap(struct cma *cma, unsigned long pfn, int count) +{ + unsigned long bitmap_no, bitmap_count; + + bitmap_no = (pfn - cma->base_pfn) >> cma->order_per_bit; + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + mutex_lock(&cma->lock); + bitmap_clear(cma->bitmap, bitmap_no, bitmap_count); + mutex_unlock(&cma->lock); +} + +static int __init cma_activate_area(struct cma *cma) +{ + int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long); + unsigned long base_pfn = cma->base_pfn, pfn = base_pfn; + unsigned i = cma->count >> pageblock_order; + struct zone *zone; + + cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL); + + if (!cma->bitmap) + return -ENOMEM; + + WARN_ON_ONCE(!pfn_valid(pfn)); + zone = page_zone(pfn_to_page(pfn)); + + do { + unsigned j; + + base_pfn = pfn; + for (j = pageblock_nr_pages; j; --j, pfn++) { + WARN_ON_ONCE(!pfn_valid(pfn)); + /* + * alloc_contig_range requires the pfn range + * specified to be in the same zone. Make this + * simple by forcing the entire CMA resv range + * to be in the same zone. + */ + if (page_zone(pfn_to_page(pfn)) != zone) + goto err; + } + init_cma_reserved_pageblock(pfn_to_page(base_pfn)); + } while (--i); + + mutex_init(&cma->lock); + return 0; + +err: + kfree(cma->bitmap); + return -EINVAL; +} + +static int __init cma_init_reserved_areas(void) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = cma_activate_area(&cma_areas[i]); + + if (ret) + return ret; + } + + return 0; +} +core_initcall(cma_init_reserved_areas); + +/** + * cma_declare_contiguous() - reserve custom contiguous area + * @base: Base address of the reserved area optional, use 0 for any + * @size: Size of the reserved area (in bytes), + * @limit: End address of the reserved memory (optional, 0 for any). + * @alignment: Alignment for the CMA area, should be power of 2 or zero + * @order_per_bit: Order of pages represented by one bit on bitmap. + * @fixed: hint about where to place the reserved area + * @res_cma: Pointer to store the created cma region. + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the early allocator (memblock or bootmem) + * has been activated and all other subsystems have already allocated/reserved + * memory. This function allows to create custom reserved areas. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. + */ +int __init cma_declare_contiguous(phys_addr_t base, + phys_addr_t size, phys_addr_t limit, + phys_addr_t alignment, unsigned int order_per_bit, + bool fixed, struct cma **res_cma) +{ + struct cma *cma; + int ret = 0; + + pr_debug("%s(size %lx, base %08lx, limit %08lx alignment %08lx)\n", + __func__, (unsigned long)size, (unsigned long)base, + (unsigned long)limit, (unsigned long)alignment); + + if (cma_area_count == ARRAY_SIZE(cma_areas)) { + pr_err("Not enough slots for CMA reserved regions!\n"); + return -ENOSPC; + } + + if (!size) + return -EINVAL; + + if (alignment && !is_power_of_2(alignment)) + return -EINVAL; + + /* + * Sanitise input arguments. + * Pages both ends in CMA area could be merged into adjacent unmovable + * migratetype page by page allocator's buddy algorithm. In the case, + * you couldn't get a contiguous memory, which is not what we want. + */ + alignment = max(alignment, + (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order)); + base = ALIGN(base, alignment); + size = ALIGN(size, alignment); + limit &= ~(alignment - 1); + + /* size should be aligned with order_per_bit */ + if (!IS_ALIGNED(size >> PAGE_SHIFT, 1 << order_per_bit)) + return -EINVAL; + + /* Reserve memory */ + if (base && fixed) { + if (memblock_is_region_reserved(base, size) || + memblock_reserve(base, size) < 0) { + ret = -EBUSY; + goto err; + } + } else { + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); + if (!addr) { + ret = -ENOMEM; + goto err; + } else { + base = addr; + } + } + + /* + * Each reserved area must be initialised later, when more kernel + * subsystems (like slab allocator) are available. + */ + cma = &cma_areas[cma_area_count]; + cma->base_pfn = PFN_DOWN(base); + cma->count = size >> PAGE_SHIFT; + cma->order_per_bit = order_per_bit; + *res_cma = cma; + cma_area_count++; + + pr_info("Reserved %ld MiB at %08lx\n", (unsigned long)size / SZ_1M, + (unsigned long)base); + return 0; + +err: + pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M); + return ret; +} + +/** + * cma_alloc() - allocate pages from contiguous area + * @cma: Contiguous memory region for which the allocation is performed. + * @count: Requested number of pages. + * @align: Requested alignment of pages (in PAGE_SIZE order). + * + * This function allocates part of contiguous memory on specific + * contiguous memory area. + */ +struct page *cma_alloc(struct cma *cma, int count, unsigned int align) +{ + unsigned long mask, pfn, start = 0; + unsigned long bitmap_maxno, bitmap_no, bitmap_count; + struct page *page = NULL; + int ret; + + if (!cma || !cma->count) + return NULL; + + pr_debug("%s(cma %p, count %d, align %d)\n", __func__, (void *)cma, + count, align); + + if (!count) + return NULL; + + mask = cma_bitmap_aligned_mask(cma, align); + bitmap_maxno = cma_bitmap_maxno(cma); + bitmap_count = cma_bitmap_pages_to_bits(cma, count); + + for (;;) { + mutex_lock(&cma->lock); + bitmap_no = bitmap_find_next_zero_area(cma->bitmap, + bitmap_maxno, start, bitmap_count, mask); + if (bitmap_no >= bitmap_maxno) { + mutex_unlock(&cma->lock); + break; + } + bitmap_set(cma->bitmap, bitmap_no, bitmap_count); + /* + * It's safe to drop the lock here. We've marked this region for + * our exclusive use. If the migration fails we will take the + * lock again and unmark it. + */ + mutex_unlock(&cma->lock); + + pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit); + mutex_lock(&cma_mutex); + ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA); + mutex_unlock(&cma_mutex); + if (ret == 0) { + page = pfn_to_page(pfn); + break; + } + + cma_clear_bitmap(cma, pfn, count); + if (ret != -EBUSY) + break; + + pr_debug("%s(): memory range at %p is busy, retrying\n", + __func__, pfn_to_page(pfn)); + /* try again with a bit different memory target */ + start = bitmap_no + mask + 1; + } + + pr_debug("%s(): returned %p\n", __func__, page); + return page; +} + +/** + * cma_release() - release allocated pages + * @cma: Contiguous memory region for which the allocation is performed. + * @pages: Allocated pages. + * @count: Number of allocated pages. + * + * This function releases memory allocated by alloc_cma(). + * It returns false when provided pages do not belong to contiguous area and + * true otherwise. + */ +bool cma_release(struct cma *cma, struct page *pages, int count) +{ + unsigned long pfn; + + if (!cma || !pages) + return false; + + pr_debug("%s(page %p)\n", __func__, (void *)pages); + + pfn = page_to_pfn(pages); + + if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) + return false; + + VM_BUG_ON(pfn + count > cma->base_pfn + cma->count); + + free_contig_range(pfn, count); + cma_clear_bitmap(cma, pfn, count); + + return true; +} diff --git a/mm/compaction.c b/mm/compaction.c index 21bf292b642a..51750197db11 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -325,14 +325,14 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); - total_isolated += isolated; - for (i = 0; i < isolated; i++) { - list_add(&page->lru, freelist); - page++; - } - - /* If a page was split, advance to the end of it */ if (isolated) { + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ blockpfn += isolated - 1; cursor += isolated - 1; continue; @@ -341,9 +341,6 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, isolate_fail: if (strict) break; - else - continue; - } trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); diff --git a/mm/filemap.c b/mm/filemap.c index d175917e2411..21a4df0e080f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -233,7 +233,6 @@ void delete_from_page_cache(struct page *page) spin_lock_irq(&mapping->tree_lock); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage) freepage(page); @@ -489,8 +488,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) if (PageSwapBacked(new)) __inc_zone_page_state(new, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); - /* mem_cgroup codes must not be called under tree_lock */ - mem_cgroup_replace_page_cache(old, new); + mem_cgroup_migrate(old, new, true); radix_tree_preload_end(); if (freepage) freepage(old); @@ -548,19 +546,19 @@ static int __add_to_page_cache_locked(struct page *page, pgoff_t offset, gfp_t gfp_mask, void **shadowp) { + struct mem_cgroup *memcg; int error; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageSwapBacked(page), page); - error = mem_cgroup_charge_file(page, current->mm, - gfp_mask & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp_mask, &memcg); if (error) return error; error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM); if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); return error; } @@ -575,13 +573,14 @@ static int __add_to_page_cache_locked(struct page *page, goto err_insert; __inc_zone_page_state(page, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_commit_charge(page, memcg, false); trace_mm_filemap_add_to_page_cache(page); return 0; err_insert: page->mapping = NULL; /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); return error; } @@ -808,6 +807,17 @@ int __lock_page_killable(struct page *page) } EXPORT_SYMBOL_GPL(__lock_page_killable); +/* + * Return values: + * 1 - page is locked; mmap_sem is still held. + * 0 - page is not locked. + * mmap_sem has been released (up_read()), unless flags had both + * FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in + * which case mmap_sem is still held. + * + * If neither ALLOW_RETRY nor KILLABLE are set, will always return 1 + * with the page locked and the mmap_sem unperturbed. + */ int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { @@ -1088,9 +1098,9 @@ no_page: if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) fgp_flags |= FGP_LOCK; - /* Init accessed so avoit atomic mark_page_accessed later */ + /* Init accessed so avoid atomic mark_page_accessed later */ if (fgp_flags & FGP_ACCESSED) - init_page_accessed(page); + __SetPageReferenced(page); err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { @@ -1824,6 +1834,18 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * The goto's are kind of ugly, but this streamlines the normal case of having * it in the page cache, and handles the special cases reasonably without * having a lot of duplicated code. + * + * vma->vm_mm->mmap_sem must be held on entry. + * + * If our return value has VM_FAULT_RETRY set, it's because + * lock_page_or_retry() returned 0. + * The mmap_sem has usually been released in this case. + * See __lock_page_or_retry() for the exception. + * + * If our return value does not have VM_FAULT_RETRY set, the mmap_sem + * has not been released. + * + * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. */ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -258,6 +258,11 @@ unmap: return ret; } +/* + * mmap_sem must be held on entry. If @nonblocking != NULL and + * *@flags does not include FOLL_NOWAIT, the mmap_sem may be released. + * If it is, *@nonblocking will be set to 0 and -EBUSY returned. + */ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long address, unsigned int *flags, int *nonblocking) { @@ -373,7 +378,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. * - * Must be called with mmap_sem held for read or write. + * Must be called with mmap_sem held. It may be released. See below. * * __get_user_pages walks a process's page tables and takes a reference to * each struct page that each user address corresponds to at a given @@ -396,7 +401,14 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) * * If @nonblocking != NULL, __get_user_pages will not wait for disk IO * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. + * *@nonblocking will be set to 0. Further, if @gup_flags does not + * include FOLL_NOWAIT, the mmap_sem will be released via up_read() in + * this case. + * + * A caller using such a combination of @nonblocking and @gup_flags + * must therefore hold the mmap_sem for reading only, and recognize + * when it's been released. Otherwise, it must be held for either + * reading or writing and will not be released. * * In most cases, get_user_pages or get_user_pages_fast should be used * instead of __get_user_pages. __get_user_pages should be used only if @@ -528,7 +540,7 @@ EXPORT_SYMBOL(__get_user_pages); * such architectures, gup() will not be enough to make a subsequent access * succeed. * - * This should be called with the mm_sem held for read. + * This has the same semantics wrt the @mm->mmap_sem as does filemap_fault(). */ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 33514d88fef9..02559efd9827 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { + struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + return VM_FAULT_OOM; + pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); return VM_FAULT_OOM; + } clear_huge_page(page, haddr, HPAGE_PMD_NR); /* @@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); } else { @@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_KERNEL))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { - mem_cgroup_uncharge_page(page); put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + struct mem_cgroup *memcg; spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; @@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_charge_anon(pages[i], mm, - GFP_KERNEL))) { + mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { if (pages[i]) put_page(pages[i]); - mem_cgroup_uncharge_start(); while (--i >= 0) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; } + set_page_private(pages[i], (unsigned long)memcg); } for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pte_t *pte, entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr); + mem_cgroup_commit_charge(pages[i], memcg, false); + lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1065,12 +1074,12 @@ out: out_free_pages: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -1081,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; + struct mem_cgroup *memcg; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1132,7 +1142,8 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1161,7 +1172,7 @@ alloc: put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); put_page(new_page); goto out_mn; } else { @@ -1170,6 +1181,8 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); if (!page) { @@ -1681,7 +1694,7 @@ static void __split_huge_page_refcount(struct page *page, &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ - smp_mb(); + smp_mb__after_atomic(); /* * retain hwpoison flag of the poisoned tail page: @@ -1775,6 +1788,8 @@ static int __split_huge_page_map(struct page *page, if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); + if (pmd_write(*pmd)) + BUG_ON(page_mapcount(page) != 1); haddr = address; for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1784,8 +1799,6 @@ static int __split_huge_page_map(struct page *page, entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (!pmd_write(*pmd)) entry = pte_wrprotect(entry); - else - BUG_ON(page_mapcount(page) != 1); if (!pmd_young(*pmd)) entry = pte_mkold(entry); if (pmd_numa(*pmd)) @@ -2389,6 +2402,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; + struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2399,7 +2413,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) return; /* @@ -2486,6 +2501,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); @@ -2499,7 +2516,7 @@ out_up_write: return; out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2024bbd573d2..a8d4155eb019 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,6 @@ #include <linux/node.h> #include "internal.h" -const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -1734,21 +1733,13 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store_common(bool obey_mempolicy, - struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t len) +static ssize_t __nr_hugepages_store_common(bool obey_mempolicy, + struct hstate *h, int nid, + unsigned long count, size_t len) { int err; - int nid; - unsigned long count; - struct hstate *h; NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = kstrtoul(buf, 10, &count); - if (err) - goto out; - - h = kobj_to_hstate(kobj, &nid); if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; @@ -1784,6 +1775,23 @@ out: return err; } +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, const char *buf, + size_t len) +{ + struct hstate *h; + unsigned long count; + int nid; + int err; + + err = kstrtoul(buf, 10, &count); + if (err) + return err; + + h = kobj_to_hstate(kobj, &nid); + return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len); +} + static ssize_t nr_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -1793,7 +1801,7 @@ static ssize_t nr_hugepages_show(struct kobject *kobj, static ssize_t nr_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(false, kobj, attr, buf, len); + return nr_hugepages_store_common(false, kobj, buf, len); } HSTATE_ATTR(nr_hugepages); @@ -1812,7 +1820,7 @@ static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) { - return nr_hugepages_store_common(true, kobj, attr, buf, len); + return nr_hugepages_store_common(true, kobj, buf, len); } HSTATE_ATTR(nr_hugepages_mempolicy); #endif @@ -2248,36 +2256,21 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; - unsigned long tmp; + unsigned long tmp = h->max_huge_pages; int ret; if (!hugepages_supported()) return -ENOTSUPP; - tmp = h->max_huge_pages; - - if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) - return -EINVAL; - table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); if (ret) goto out; - if (write) { - NODEMASK_ALLOC(nodemask_t, nodes_allowed, - GFP_KERNEL | __GFP_NORETRY); - if (!(obey_mempolicy && - init_nodemask_of_mempolicy(nodes_allowed))) { - NODEMASK_FREE(nodes_allowed); - nodes_allowed = &node_states[N_MEMORY]; - } - h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - - if (nodes_allowed != &node_states[N_MEMORY]) - NODEMASK_FREE(nodes_allowed); - } + if (write) + ret = __nr_hugepages_store_common(obey_mempolicy, h, + NUMA_NO_NODE, tmp, *length); out: return ret; } @@ -2753,8 +2746,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, - struct page *page, unsigned long address) +static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; @@ -2793,8 +2786,6 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); - - return 1; } /* @@ -2809,7 +2800,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; - int outside_reserve = 0; + int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2839,14 +2830,14 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ + /* + * Drop page table lock as buddy allocator may be called. It will + * be acquired again before returning to the caller, as expected. + */ spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { - long err = PTR_ERR(new_page); - page_cache_release(old_page); - /* * If a process owning a MAP_PRIVATE mapping fails to COW, * it is due to references held by a child and an insufficient @@ -2855,29 +2846,25 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + page_cache_release(old_page); BUG_ON(huge_pte_none(pte)); - if (unmap_ref_private(mm, vma, old_page, address)) { - BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); - ptep = huge_pte_offset(mm, address & huge_page_mask(h)); - if (likely(ptep && - pte_same(huge_ptep_get(ptep), pte))) - goto retry_avoidcopy; - /* - * race occurs while re-acquiring page table - * lock, and our job is done. - */ - return 0; - } - WARN_ON_ONCE(1); + unmap_ref_private(mm, vma, old_page, address); + BUG_ON(huge_pte_none(pte)); + spin_lock(ptl); + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(ptep && + pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page table + * lock, and our job is done. + */ + return 0; } - /* Caller expects lock to be held */ - spin_lock(ptl); - if (err == -ENOMEM) - return VM_FAULT_OOM; - else - return VM_FAULT_SIGBUS; + ret = (PTR_ERR(new_page) == -ENOMEM) ? + VM_FAULT_OOM : VM_FAULT_SIGBUS; + goto out_release_old; } /* @@ -2885,11 +2872,8 @@ retry_avoidcopy: * anon_vma prepared. */ if (unlikely(anon_vma_prepare(vma))) { - page_cache_release(new_page); - page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return VM_FAULT_OOM; + ret = VM_FAULT_OOM; + goto out_release_all; } copy_user_huge_page(new_page, old_page, address, vma, @@ -2899,6 +2883,7 @@ retry_avoidcopy: mmun_start = address & huge_page_mask(h); mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + /* * Retake the page table lock to check for racing updates * before the page tables are altered @@ -2919,12 +2904,13 @@ retry_avoidcopy: } spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); +out_release_all: page_cache_release(new_page); +out_release_old: page_cache_release(old_page); - /* Caller expects lock to be held */ - spin_lock(ptl); - return 0; + spin_lock(ptl); /* Caller expects lock to be held */ + return ret; } /* Return the pagecache page at a given address within a VMA */ diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 95487c71cad5..329caf56df22 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -72,8 +72,7 @@ DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); static void pfn_inject_exit(void) { - if (hwpoison_dir) - debugfs_remove_recursive(hwpoison_dir); + debugfs_remove_recursive(hwpoison_dir); } static int pfn_inject_init(void) diff --git a/mm/internal.h b/mm/internal.h index 7f22a11fcc66..a1b651b11c5f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -247,7 +247,7 @@ static inline void mlock_migrate_page(struct page *new, struct page *old) { } static inline struct page *mem_map_offset(struct page *base, int offset) { if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return pfn_to_page(page_to_pfn(base) + offset); + return nth_page(base, offset); return base + offset; } diff --git a/mm/madvise.c b/mm/madvise.c index a402f8fdc68e..0938b30da4ab 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -292,9 +292,6 @@ static long madvise_dontneed(struct vm_area_struct *vma, /* * Application wants to free up the pages and associated backing store. * This is effectively punching a hole into the middle of a file. - * - * NOTE: Currently, only shmfs/tmpfs is supported for this operation. - * Other filesystems return -ENOSYS. */ static long madvise_remove(struct vm_area_struct *vma, struct vm_area_struct **prev, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 45c10c6fc3ce..85dd94f2ecce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -754,9 +754,11 @@ static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); __mem_cgroup_remove_exceeded(mz, mctz); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } @@ -779,7 +781,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * mem is over its softlimit. */ if (excess || mz->on_tree) { - spin_lock(&mctz->lock); + unsigned long flags; + + spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); @@ -788,7 +792,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irqrestore(&mctz->lock, flags); } } } @@ -839,9 +843,9 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) { struct mem_cgroup_per_zone *mz; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); mz = __mem_cgroup_largest_soft_limit_node(mctz); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); return mz; } @@ -882,13 +886,6 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg, return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - bool charge) -{ - int val = (charge) ? 1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); -} - static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { @@ -909,13 +906,13 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, - bool anon, int nr_pages) + int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ - if (anon) + if (PageAnon(page)) __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); else @@ -1013,7 +1010,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, */ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { - preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ if (unlikely(mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_THRESH))) { @@ -1026,8 +1022,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) do_numainfo = mem_cgroup_event_ratelimit(memcg, MEM_CGROUP_TARGET_NUMAINFO); #endif - preempt_enable(); - mem_cgroup_threshold(memcg); if (unlikely(do_softlimit)) mem_cgroup_update_tree(memcg, page); @@ -1035,8 +1029,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) if (unlikely(do_numainfo)) atomic_inc(&memcg->numainfo_events); #endif - } else - preempt_enable(); + } } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1347,20 +1340,6 @@ out: return lruvec; } -/* - * Following LRU functions are allowed to be used without PCG_LOCK. - * Operations are called by routine of global LRU independently from memcg. - * What we have to take care of here is validness of pc->mem_cgroup. - * - * Changes to pc->mem_cgroup happens when - * 1. charge - * 2. moving account - * In typical case, "charge" is done before add-to-lru. Exception is SwapCache. - * It is added to LRU before charge. - * If PCG_USED bit is not set, page_cgroup is not added to this private LRU. - * When moving account, the page is not on LRU. It's isolated. - */ - /** * mem_cgroup_page_lruvec - return lruvec for adding an lru page * @page: the page @@ -2261,22 +2240,14 @@ cleanup: * * Notes: Race condition * - * We usually use lock_page_cgroup() for accessing page_cgroup member but - * it tends to be costly. But considering some conditions, we doesn't need - * to do so _always_. + * Charging occurs during page instantiation, while the page is + * unmapped and locked in page migration, or while the page table is + * locked in THP migration. No race is possible. * - * Considering "charge", lock_page_cgroup() is not required because all - * file-stat operations happen after a page is attached to radix-tree. There - * are no race with "charge". + * Uncharge happens to pages with zero references, no race possible. * - * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup - * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even - * if there are race with "uncharge". Statistics itself is properly handled - * by flags. - * - * Considering "move", this is an only case we see a race. To make the race - * small, we check memcg->moving_account and detect there are possibility - * of race or not. If there is, we take a lock. + * Charge moving between groups is protected by checking mm->moving + * account and taking the move_lock in the slowpath. */ void __mem_cgroup_begin_update_page_stat(struct page *page, @@ -2551,55 +2522,63 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) +static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, + unsigned int nr_pages) { - unsigned long csize = nr_pages * PAGE_SIZE; + unsigned int batch = max(CHARGE_BATCH, nr_pages); + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct res_counter *fail_res; + unsigned long nr_reclaimed; unsigned long flags = 0; - int ret; + unsigned long long size; + int ret = 0; - ret = res_counter_charge(&memcg->res, csize, &fail_res); +retry: + if (consume_stock(memcg, nr_pages)) + goto done; - if (likely(!ret)) { + size = batch * PAGE_SIZE; + if (!res_counter_charge(&memcg->res, size, &fail_res)) { if (!do_swap_account) - return CHARGE_OK; - ret = res_counter_charge(&memcg->memsw, csize, &fail_res); - if (likely(!ret)) - return CHARGE_OK; - - res_counter_uncharge(&memcg->res, csize); + goto done_restock; + if (!res_counter_charge(&memcg->memsw, size, &fail_res)) + goto done_restock; + res_counter_uncharge(&memcg->res, size); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (batch > nr_pages) { + batch = nr_pages; + goto retry; + } + /* - * Never reclaim on behalf of optional batching, retry with a - * single page instead. + * Unlike in global OOM situations, memcg is not in a physical + * memory shortage. Allow dying and OOM-killed tasks to + * bypass the last charges so that they can exit quickly and + * free their memory. */ - if (nr_pages > min_pages) - return CHARGE_RETRY; + if (unlikely(test_thread_flag(TIF_MEMDIE) || + fatal_signal_pending(current) || + current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; if (!(gfp_mask & __GFP_WAIT)) - return CHARGE_WOULDBLOCK; + goto nomem; - if (gfp_mask & __GFP_NORETRY) - return CHARGE_NOMEM; + nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); + + if (mem_cgroup_margin(mem_over_limit) >= batch) + goto retry; - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if (mem_cgroup_margin(mem_over_limit) >= nr_pages) - return CHARGE_RETRY; + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge @@ -2609,142 +2588,47 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, * unlikely to succeed so close to the limit, and we fall back * to regular pages anyway in case of failure. */ - if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) - return CHARGE_RETRY; - + if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER)) + goto retry; /* * At task move, charge accounts can be doubly counted. So, it's * better to wait until the end of task_move if something is going on. */ if (mem_cgroup_wait_acct_move(mem_over_limit)) - return CHARGE_RETRY; - - if (invoke_oom) - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize)); - - return CHARGE_NOMEM; -} - -/** - * mem_cgroup_try_charge - try charging a memcg - * @memcg: memcg to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns 0 if @memcg was charged successfully, -EINTR if the charge - * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. - */ -static int mem_cgroup_try_charge(struct mem_cgroup *memcg, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) -{ - unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - int ret; - - if (mem_cgroup_is_root(memcg)) - goto done; - /* - * Unlike in global OOM situations, memcg is not in a physical - * memory shortage. Allow dying and OOM-killed tasks to - * bypass the last charges so that they can exit quickly and - * free their memory. - */ - if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current) || - current->flags & PF_EXITING)) - goto bypass; + goto retry; - if (unlikely(task_in_memcg_oom(current))) - goto nomem; + if (nr_retries--) + goto retry; if (gfp_mask & __GFP_NOFAIL) - oom = false; -again: - if (consume_stock(memcg, nr_pages)) - goto done; - - do { - bool invoke_oom = oom && !nr_oom_retries; - - /* If killed, bypass charge */ - if (fatal_signal_pending(current)) - goto bypass; + goto bypass; - ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, - nr_pages, invoke_oom); - switch (ret) { - case CHARGE_OK: - break; - case CHARGE_RETRY: /* not in OOM situation but retry */ - batch = nr_pages; - goto again; - case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - goto nomem; - case CHARGE_NOMEM: /* OOM routine works */ - if (!oom || invoke_oom) - goto nomem; - nr_oom_retries--; - break; - } - } while (ret != CHARGE_OK); + if (fatal_signal_pending(current)) + goto bypass; - if (batch > nr_pages) - refill_stock(memcg, batch - nr_pages); -done: - return 0; + mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); nomem: if (!(gfp_mask & __GFP_NOFAIL)) return -ENOMEM; bypass: - return -EINTR; -} - -/** - * mem_cgroup_try_charge_mm - try charging a mm - * @mm: mm_struct to charge - * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails - * - * Returns the charged mem_cgroup associated with the given mm_struct or - * NULL the charge failed. - */ -static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - bool oom) + memcg = root_mem_cgroup; + ret = -EINTR; + goto retry; -{ - struct mem_cgroup *memcg; - int ret; - - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - memcg = NULL; - - return memcg; +done_restock: + if (batch > nr_pages) + refill_stock(memcg, batch - nr_pages); +done: + return ret; } -/* - * Somemtimes we have to undo a charge we got by try_charge(). - * This function is for that and do uncharge, put css's refcnt. - * gotten by try_charge(). - */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, - unsigned int nr_pages) +static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(memcg)) { - unsigned long bytes = nr_pages * PAGE_SIZE; + unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&memcg->res, bytes); - if (do_swap_account) - res_counter_uncharge(&memcg->memsw, bytes); - } + res_counter_uncharge(&memcg->res, bytes); + if (do_swap_account) + res_counter_uncharge(&memcg->memsw, bytes); } /* @@ -2756,9 +2640,6 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, { unsigned long bytes = nr_pages * PAGE_SIZE; - if (mem_cgroup_is_root(memcg)) - return; - res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes); if (do_swap_account) res_counter_uncharge_until(&memcg->memsw, @@ -2779,6 +2660,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return mem_cgroup_from_id(id); } +/* + * try_get_mem_cgroup_from_page - look up page's memcg association + * @page: the page + * + * Look up, get a css reference, and return the memcg that owns @page. + * + * The page must be locked to prevent racing with swap-in and page + * cache charges. If coming from an unlocked page table, the caller + * must ensure the page is on the LRU or this can race with charging. + */ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { struct mem_cgroup *memcg = NULL; @@ -2789,7 +2680,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) VM_BUG_ON_PAGE(!PageLocked(page), page); pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { memcg = pc->mem_cgroup; if (memcg && !css_tryget_online(&memcg->css)) @@ -2803,23 +2693,17 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) memcg = NULL; rcu_read_unlock(); } - unlock_page_cgroup(pc); return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, - struct page *page, - unsigned int nr_pages, - enum charge_type ctype, - bool lrucare) +static void commit_charge(struct page *page, struct mem_cgroup *memcg, + unsigned int nr_pages, bool lrucare) { struct page_cgroup *pc = lookup_page_cgroup(page); struct zone *uninitialized_var(zone); - struct lruvec *lruvec; bool was_on_lru = false; - bool anon; + struct lruvec *lruvec; - lock_page_cgroup(pc); VM_BUG_ON_PAGE(PageCgroupUsed(pc), page); /* * we don't need page_cgroup_lock about tail pages, becase they are not @@ -2841,16 +2725,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } } - pc->mem_cgroup = memcg; /* - * We access a page_cgroup asynchronously without lock_page_cgroup(). - * Especially when a page_cgroup is taken from a page, pc->mem_cgroup - * is accessed after testing USED bit. To make pc->mem_cgroup visible - * before USED bit, we need memory barrier here. - * See mem_cgroup_add_lru_list(), etc. + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point: + * + * - the page is uncharged + * + * - the page is off-LRU + * + * - an anonymous fault has exclusive page access, except for + * a locked page table + * + * - a page cache insertion, a swapin fault, or a migration + * have the page locked */ - smp_wmb(); - SetPageCgroupUsed(pc); + pc->mem_cgroup = memcg; + pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0); if (lrucare) { if (was_on_lru) { @@ -2862,20 +2752,15 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, spin_unlock_irq(&zone->lru_lock); } - if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) - anon = true; - else - anon = false; - - mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); - unlock_page_cgroup(pc); - + local_irq_disable(); + mem_cgroup_charge_statistics(memcg, page, nr_pages); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ memcg_check_events(memcg, page); + local_irq_enable(); } static DEFINE_MUTEX(set_limit_mutex); @@ -2937,22 +2822,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) if (ret) return ret; - ret = mem_cgroup_try_charge(memcg, gfp, size >> PAGE_SHIFT, - oom_gfp_allowed(gfp)); + ret = try_charge(memcg, gfp, size >> PAGE_SHIFT); if (ret == -EINTR) { /* - * mem_cgroup_try_charge() chosed to bypass to root due to - * OOM kill or fatal signal. Since our only options are to - * either fail the allocation or charge it to this cgroup, do - * it as a temporary condition. But we can't fail. From a - * kmem/slab perspective, the cache has already been selected, - * by mem_cgroup_kmem_get_cache(), so it is too late to change + * try_charge() chose to bypass to root due to OOM kill or + * fatal signal. Since our only options are to either fail + * the allocation or charge it to this cgroup, do it as a + * temporary condition. But we can't fail. From a kmem/slab + * perspective, the cache has already been selected, by + * mem_cgroup_kmem_get_cache(), so it is too late to change * our minds. * * This condition will only trigger if the task entered - * memcg_charge_kmem in a sane state, but was OOM-killed during - * mem_cgroup_try_charge() above. Tasks that were already - * dying when the allocation triggers should have been already + * memcg_charge_kmem in a sane state, but was OOM-killed + * during try_charge() above. Tasks that were already dying + * when the allocation triggers should have been already * directed to the root cgroup in memcontrol.h */ res_counter_charge_nofail(&memcg->res, size, &fail_res); @@ -3463,12 +3347,13 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, memcg_uncharge_kmem(memcg, PAGE_SIZE << order); return; } - + /* + * The page is freshly allocated and not visible to any + * outside callers yet. Set up pc non-atomically. + */ pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); pc->mem_cgroup = memcg; - SetPageCgroupUsed(pc); - unlock_page_cgroup(pc); + pc->flags = PCG_USED; } void __memcg_kmem_uncharge_pages(struct page *page, int order) @@ -3478,19 +3363,11 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) pc = lookup_page_cgroup(page); - /* - * Fast unlocked return. Theoretically might have changed, have to - * check again after locking. - */ if (!PageCgroupUsed(pc)) return; - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); + memcg = pc->mem_cgroup; + pc->flags = 0; /* * We trust that only if there is a memcg associated with the page, it @@ -3510,7 +3387,6 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) /* * Because tail pages are not marked as "used", set it. We're under * zone->lru_lock, 'splitting on pmd' and compound_lock. @@ -3531,8 +3407,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ - pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + pc->flags = head_pc->flags; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], HPAGE_PMD_NR); @@ -3562,7 +3437,6 @@ static int mem_cgroup_move_account(struct page *page, { unsigned long flags; int ret; - bool anon = PageAnon(page); VM_BUG_ON(from == to); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -3576,15 +3450,13 @@ static int mem_cgroup_move_account(struct page *page, if (nr_pages > 1 && !PageTransHuge(page)) goto out; - lock_page_cgroup(pc); - ret = -EINVAL; if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) - goto unlock; + goto out; move_lock_mem_cgroup(from, &flags); - if (!anon && page_mapped(page)) { + if (!PageAnon(page) && page_mapped(page)) { __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], nr_pages); __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], @@ -3598,20 +3470,23 @@ static int mem_cgroup_move_account(struct page *page, nr_pages); } - mem_cgroup_charge_statistics(from, page, anon, -nr_pages); + /* + * It is safe to change pc->mem_cgroup here because the page + * is referenced, charged, and isolated - we can't race with + * uncharging, charging, migration, or LRU putback. + */ /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, page, anon, nr_pages); move_unlock_mem_cgroup(from, &flags); ret = 0; -unlock: - unlock_page_cgroup(pc); - /* - * check events - */ + + local_irq_disable(); + mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); + mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); + local_irq_enable(); out: return ret; } @@ -3682,456 +3557,12 @@ out: return ret; } -int mem_cgroup_charge_anon(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - unsigned int nr_pages = 1; - struct mem_cgroup *memcg; - bool oom = true; - - if (mem_cgroup_disabled()) - return 0; - - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - VM_BUG_ON(!mm); - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* - * Never OOM-kill a process for a huge page. The - * fault handler will fall back to regular pages. - */ - oom = false; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, nr_pages, - MEM_CGROUP_CHARGE_TYPE_ANON, false); - return 0; -} - -/* - * While swap-in, try_charge -> commit or cancel, the page is locked. - * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is acquired. This refcnt will be consumed by - * "commit()" or removed by "cancel()" - */ -static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - int ret; - - pc = lookup_page_cgroup(page); - /* - * Every swap fault against a single page tries to charge the - * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. The USED bit is protected by - * the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. - */ - if (PageCgroupUsed(pc)) - goto out; - if (do_swap_account) - memcg = try_get_mem_cgroup_from_page(page); - if (!memcg) - memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); - css_put(&memcg->css); - if (ret == -EINTR) - memcg = root_mem_cgroup; - else if (ret) - return ret; -out: - *memcgp = memcg; - return 0; -} - -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, - gfp_t gfp_mask, struct mem_cgroup **memcgp) -{ - if (mem_cgroup_disabled()) { - *memcgp = NULL; - return 0; - } - /* - * A racing thread's fault, or swapoff, may have already - * updated the pte, and even removed page from swap cache: in - * those cases unuse_pte()'s pte_same() test will fail; but - * there's also a KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) { - struct mem_cgroup *memcg; - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - *memcgp = memcg; - return 0; - } - return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); -} - -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); -} - -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, - enum charge_type ctype) -{ - if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - - __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); - /* - * Now swap is on-memory. This means this page may be - * counted both as mem and swap....double count. - * Fix it by uncharging from memsw. Basically, this SwapCache is stable - * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() - * may call delete_from_swap_cache() before reach here. - */ - if (do_swap_account && PageSwapCache(page)) { - swp_entry_t ent = {.val = page_private(page)}; - mem_cgroup_uncharge_swap(ent); - } -} - -void mem_cgroup_commit_charge_swapin(struct page *page, - struct mem_cgroup *memcg) -{ - __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - -int mem_cgroup_charge_file(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - struct mem_cgroup *memcg; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (PageSwapCache(page)) { /* shmem */ - ret = __mem_cgroup_try_charge_swapin(mm, page, - gfp_mask, &memcg); - if (ret) - return ret; - __mem_cgroup_commit_charge_swapin(page, memcg, type); - return 0; - } - - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, 1, true); - if (!memcg) - return -ENOMEM; - __mem_cgroup_commit_charge(memcg, page, 1, type, false); - return 0; -} - -static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, - unsigned int nr_pages, - const enum charge_type ctype) -{ - struct memcg_batch_info *batch = NULL; - bool uncharge_memsw = true; - - /* If swapout, usage of swap doesn't decrease */ - if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - uncharge_memsw = false; - - batch = ¤t->memcg_batch; - /* - * In usual, we do css_get() when we remember memcg pointer. - * But in this case, we keep res->usage until end of a series of - * uncharges. Then, it's ok to ignore memcg's refcnt. - */ - if (!batch->memcg) - batch->memcg = memcg; - /* - * do_batch > 0 when unmapping pages or inode invalidate/truncate. - * In those cases, all pages freed continuously can be expected to be in - * the same cgroup and we have chance to coalesce uncharges. - * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) - * because we want to do uncharge as soon as possible. - */ - - if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) - goto direct_uncharge; - - if (nr_pages > 1) - goto direct_uncharge; - - /* - * In typical case, batch->memcg == mem. This means we can - * merge a series of uncharges to an uncharge of res_counter. - * If not, we uncharge res_counter ony by one. - */ - if (batch->memcg != memcg) - goto direct_uncharge; - /* remember freed charge and uncharge it later */ - batch->nr_pages++; - if (uncharge_memsw) - batch->memsw_nr_pages++; - return; -direct_uncharge: - res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); - if (uncharge_memsw) - res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != memcg)) - memcg_oom_recover(memcg); -} - -/* - * uncharge if !page_mapped(page) - */ -static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, - bool end_migration) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - bool anon; - - if (mem_cgroup_disabled()) - return NULL; - - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - } - /* - * Check if our page_cgroup is valid - */ - pc = lookup_page_cgroup(page); - if (unlikely(!PageCgroupUsed(pc))) - return NULL; - - lock_page_cgroup(pc); - - memcg = pc->mem_cgroup; - - if (!PageCgroupUsed(pc)) - goto unlock_out; - - anon = PageAnon(page); - - switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_ANON: - /* - * Generally PageAnon tells if it's the anon statistics to be - * updated; but sometimes e.g. mem_cgroup_uncharge_page() is - * used before page reached the stage of being marked PageAnon. - */ - anon = true; - /* fallthrough */ - case MEM_CGROUP_CHARGE_TYPE_DROP: - /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page)) - goto unlock_out; - /* - * Pages under migration may not be uncharged. But - * end_migration() /must/ be the one uncharging the - * unused post-migration page and so it has to call - * here with the migration bit still set. See the - * res_counter handling below. - */ - if (!end_migration && PageCgroupMigration(pc)) - goto unlock_out; - break; - case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: - if (!PageAnon(page)) { /* Shared memory */ - if (page->mapping && !page_is_file_cache(page)) - goto unlock_out; - } else if (page_mapped(page)) /* Anon */ - goto unlock_out; - break; - default: - break; - } - - mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); - - ClearPageCgroupUsed(pc); - /* - * pc->mem_cgroup is not cleared here. It will be accessed when it's - * freed from LRU. This is safe because uncharged page is expected not - * to be reused (freed soon). Exception is SwapCache, it's handled by - * special functions. - */ - - unlock_page_cgroup(pc); - /* - * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed, so it's safe to call css_get(). - */ - memcg_check_events(memcg, page); - if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(memcg, true); - css_get(&memcg->css); - } - /* - * Migration does not charge the res_counter for the - * replacement page, so leave it alone when phasing out the - * page that is unused after the migration. - */ - if (!end_migration && !mem_cgroup_is_root(memcg)) - mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - - return memcg; - -unlock_out: - unlock_page_cgroup(pc); - return NULL; -} - -void mem_cgroup_uncharge_page(struct page *page) -{ - /* early check. */ - if (page_mapped(page)) - return; - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - /* - * If the page is in swap cache, uncharge should be deferred - * to the swap path, which also properly accounts swap usage - * and handles memcg lifetime. - * - * Note that this check is not stable and reclaim may add the - * page to swap cache at any time after this. However, if the - * page is not in swap cache by the time page->mapcount hits - * 0, there won't be any page table references to the swap - * slot, and reclaim will free it and not actually write the - * page to disk. - */ - if (PageSwapCache(page)) - return; - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); -} - -void mem_cgroup_uncharge_cache_page(struct page *page) -{ - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping, page); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); -} - -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - current->memcg_batch.do_batch++; - /* We can do nest. */ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = ¤t->memcg_batch; - - if (!batch->do_batch) - return; - - batch->do_batch--; - if (batch->do_batch) /* If stacked, do nothing. */ - return; - - if (!batch->memcg) - return; - /* - * This "batch->memcg" is valid without any css_get/put etc... - * bacause we hide charges behind us. - */ - if (batch->nr_pages) - res_counter_uncharge(&batch->memcg->res, - batch->nr_pages * PAGE_SIZE); - if (batch->memsw_nr_pages) - res_counter_uncharge(&batch->memcg->memsw, - batch->memsw_nr_pages * PAGE_SIZE); - memcg_oom_recover(batch->memcg); - /* forget this pointer (for sanity check) */ - batch->memcg = NULL; -} - -#ifdef CONFIG_SWAP -/* - * called after __delete_from_swap_cache() and drop "page" account. - * memcg information is recorded to swap_cgroup of "ent" - */ -void -mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) -{ - struct mem_cgroup *memcg; - int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; - - if (!swapout) /* this was a swap cache but the swap is unused ! */ - ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - - memcg = __mem_cgroup_uncharge_common(page, ctype, false); - - /* - * record memcg information, if swapout && memcg != NULL, - * css_get() was called in uncharge(). - */ - if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); -} -#endif - #ifdef CONFIG_MEMCG_SWAP -/* - * called from swap_entry_free(). remove record in swap_cgroup and - * uncharge "memsw" account. - */ -void mem_cgroup_uncharge_swap(swp_entry_t ent) +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, + bool charge) { - struct mem_cgroup *memcg; - unsigned short id; - - if (!do_swap_account) - return; - - id = swap_cgroup_record(ent, 0); - rcu_read_lock(); - memcg = mem_cgroup_lookup(id); - if (memcg) { - /* - * We uncharge this because swap is freed. This memcg can - * be obsolete one. We avoid calling css_tryget_online(). - */ - if (!mem_cgroup_is_root(memcg)) - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); - mem_cgroup_swap_statistics(memcg, false); - css_put(&memcg->css); - } - rcu_read_unlock(); + int val = (charge) ? 1 : -1; + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } /** @@ -4183,175 +3614,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -/* - * Before starting migration, account PAGE_SIZE to mem_cgroup that the old - * page belongs to. - */ -void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp) -{ - struct mem_cgroup *memcg = NULL; - unsigned int nr_pages = 1; - struct page_cgroup *pc; - enum charge_type ctype; - - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return; - - if (PageTransHuge(page)) - nr_pages <<= compound_order(page); - - pc = lookup_page_cgroup(page); - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - css_get(&memcg->css); - /* - * At migrating an anonymous page, its mapcount goes down - * to 0 and uncharge() will be called. But, even if it's fully - * unmapped, migration may fail and this page has to be - * charged again. We set MIGRATION flag here and delay uncharge - * until end_migration() is called - * - * Corner Case Thinking - * A) - * When the old page was mapped as Anon and it's unmap-and-freed - * while migration was ongoing. - * If unmap finds the old page, uncharge() of it will be delayed - * until end_migration(). If unmap finds a new page, it's - * uncharged when it make mapcount to be 1->0. If unmap code - * finds swap_migration_entry, the new page will not be mapped - * and end_migration() will find it(mapcount==0). - * - * B) - * When the old page was mapped but migraion fails, the kernel - * remaps it. A charge for it is kept by MIGRATION flag even - * if mapcount goes down to 0. We can do remap successfully - * without charging it again. - * - * C) - * The "old" page is under lock_page() until the end of - * migration, so, the old page itself will not be swapped-out. - * If the new page is swapped out before end_migraton, our - * hook to usual swap-out path will catch the event. - */ - if (PageAnon(page)) - SetPageCgroupMigration(pc); - } - unlock_page_cgroup(pc); - /* - * If the page is not charged at this point, - * we return here. - */ - if (!memcg) - return; - - *memcgp = memcg; - /* - * We charge new page before it's used/mapped. So, even if unlock_page() - * is called before end_migration, we can catch all events on this new - * page. In the case new page is migrated but not remapped, new page's - * mapcount will be finally 0 and we call uncharge in end_migration(). - */ - if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - /* - * The page is committed to the memcg, but it's not actually - * charged to the res_counter since we plan on replacing the - * old one and only one page is going to be left afterwards. - */ - __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); -} - -/* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *memcg, - struct page *oldpage, struct page *newpage, bool migration_ok) -{ - struct page *used, *unused; - struct page_cgroup *pc; - bool anon; - - if (!memcg) - return; - - if (!migration_ok) { - used = oldpage; - unused = newpage; - } else { - used = newpage; - unused = oldpage; - } - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); - css_put(&memcg->css); - /* - * We disallowed uncharge of pages under migration because mapcount - * of the page goes down to zero, temporarly. - * Clear the flag and check the page should be charged. - */ - pc = lookup_page_cgroup(oldpage); - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - - /* - * If a page is a file cache, radix-tree replacement is very atomic - * and we can skip this check. When it was an Anon page, its mapcount - * goes down to 0. But because we added MIGRATION flage, it's not - * uncharged yet. There are several case but page->mapcount check - * and USED bit check in mem_cgroup_uncharge_page() will do enough - * check. (see prepare_charge() also) - */ - if (anon) - mem_cgroup_uncharge_page(used); -} - -/* - * At replace page cache, newpage is not under any memcg but it's on - * LRU. So, this function doesn't touch res_counter but handles LRU - * in correct way. Both pages are locked so we cannot race with uncharge. - */ -void mem_cgroup_replace_page_cache(struct page *oldpage, - struct page *newpage) -{ - struct mem_cgroup *memcg = NULL; - struct page_cgroup *pc; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - - if (mem_cgroup_disabled()) - return; - - pc = lookup_page_cgroup(oldpage); - /* fix accounting on old pages */ - lock_page_cgroup(pc); - if (PageCgroupUsed(pc)) { - memcg = pc->mem_cgroup; - mem_cgroup_charge_statistics(memcg, oldpage, false, -1); - ClearPageCgroupUsed(pc); - } - unlock_page_cgroup(pc); - - /* - * When called from shmem_replace_page(), in some cases the - * oldpage has already been charged, and in some cases not. - */ - if (!memcg) - return; - /* - * Even if newpage->mapping was NULL before starting replacement, - * the newpage may be on LRU(or pagevec for LRU) already. We lock - * LRU while we overwrite pc->mem_cgroup. - */ - __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); -} - #ifdef CONFIG_DEBUG_VM static struct page_cgroup *lookup_page_cgroup_used(struct page *page) { @@ -4550,7 +3812,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_mask, &nr_scanned); nr_reclaimed += reclaimed; *total_scanned += nr_scanned; - spin_lock(&mctz->lock); + spin_lock_irq(&mctz->lock); /* * If we failed to reclaim anything from this memory cgroup @@ -4590,7 +3852,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, */ /* If excess == 0, no tree ops */ __mem_cgroup_insert_exceeded(mz, mctz, excess); - spin_unlock(&mctz->lock); + spin_unlock_irq(&mctz->lock); css_put(&mz->memcg->css); loop++; /* @@ -4817,78 +4079,24 @@ out: return retval; } - -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, - enum mem_cgroup_stat_index idx) -{ - struct mem_cgroup *iter; - long val = 0; - - /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, memcg) - val += mem_cgroup_read_stat(iter, idx); - - if (val < 0) /* race ? */ - val = 0; - return val; -} - -static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) -{ - u64 val; - - if (!mem_cgroup_is_root(memcg)) { - if (!swap) - return res_counter_read_u64(&memcg->res, RES_USAGE); - else - return res_counter_read_u64(&memcg->memsw, RES_USAGE); - } - - /* - * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS - * as well as in MEM_CGROUP_STAT_RSS_HUGE. - */ - val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); - - if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); - - return val << PAGE_SHIFT; -} - static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) + struct cftype *cft) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - u64 val; - int name; - enum res_type type; - - type = MEMFILE_TYPE(cft->private); - name = MEMFILE_ATTR(cft->private); + enum res_type type = MEMFILE_TYPE(cft->private); + int name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, false); - else - val = res_counter_read_u64(&memcg->res, name); - break; + return res_counter_read_u64(&memcg->res, name); case _MEMSWAP: - if (name == RES_USAGE) - val = mem_cgroup_usage(memcg, true); - else - val = res_counter_read_u64(&memcg->memsw, name); - break; + return res_counter_read_u64(&memcg->memsw, name); case _KMEM: - val = res_counter_read_u64(&memcg->kmem, name); + return res_counter_read_u64(&memcg->kmem, name); break; default: BUG(); } - - return val; } #ifdef CONFIG_MEMCG_KMEM @@ -5350,7 +4558,10 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) if (!t) goto unlock; - usage = mem_cgroup_usage(memcg, swap); + if (!swap) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* * current_threshold points to threshold just below or equal to usage. @@ -5442,15 +4653,15 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg, mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before adding a new one */ if (thresholds->primary) __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -5530,18 +4741,19 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg, int i, j, size; mutex_lock(&memcg->thresholds_lock); - if (type == _MEM) + + if (type == _MEM) { thresholds = &memcg->thresholds; - else if (type == _MEMSWAP) + usage = res_counter_read_u64(&memcg->res, RES_USAGE); + } else if (type == _MEMSWAP) { thresholds = &memcg->memsw_thresholds; - else + usage = res_counter_read_u64(&memcg->memsw, RES_USAGE); + } else BUG(); if (!thresholds->primary) goto unlock; - usage = mem_cgroup_usage(memcg, type == _MEMSWAP); - /* Check if a threshold crossed before removing */ __mem_cgroup_threshold(memcg, type == _MEMSWAP); @@ -6295,9 +5507,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) * core guarantees its existence. */ } else { - res_counter_init(&memcg->res, NULL); - res_counter_init(&memcg->memsw, NULL); - res_counter_init(&memcg->kmem, NULL); + res_counter_init(&memcg->res, &root_mem_cgroup->res); + res_counter_init(&memcg->memsw, &root_mem_cgroup->memsw); + res_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); /* * Deeper hierachy with use_hierarchy == false doesn't make * much sense so let cgroup subsystem know about this @@ -6431,55 +5643,38 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE 256 static int mem_cgroup_do_precharge(unsigned long count) { - int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; + int ret; - if (mem_cgroup_is_root(memcg)) { + /* Try a single bulk charge without reclaim first */ + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; - /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct res_counter *dummy; - /* - * "memcg" cannot be under rmdir() because we've already checked - * by cgroup_lock_live_cgroup() that it is not removed and we - * are still under the same cgroup_mutex. So we can postpone - * css_get(). - */ - if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) - goto one_by_one; - if (do_swap_account && res_counter_charge(&memcg->memsw, - PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&memcg->res, PAGE_SIZE * count); - goto one_by_one; - } - mc.precharge += count; + if (ret == -EINTR) { + cancel_charge(root_mem_cgroup, count); return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, GFP_KERNEL, 1, false); + ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* + * In case of failure, any residual charges against + * mc.to will be dropped by mem_cgroup_clear_mc() + * later on. However, cancel any charges that are + * bypassed to root right away or they'll be lost. + */ + if (ret == -EINTR) + cancel_charge(root_mem_cgroup, 1); if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** @@ -6615,9 +5810,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, if (page) { pc = lookup_page_cgroup(page); /* - * Do only loose check w/o page_cgroup lock. - * mem_cgroup_move_account() checks the pc is valid or not under - * the lock. + * Do only loose check w/o serialization. + * mem_cgroup_move_account() checks the pc is valid or + * not under LRU exclusion. */ if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { ret = MC_TARGET_PAGE; @@ -6742,7 +5937,7 @@ static void __mem_cgroup_clear_mc(void) /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { - __mem_cgroup_cancel_charge(mc.to, mc.precharge); + cancel_charge(mc.to, mc.precharge); mc.precharge = 0; } /* @@ -6750,27 +5945,24 @@ static void __mem_cgroup_clear_mc(void) * we must uncharge here. */ if (mc.moved_charge) { - __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + cancel_charge(mc.from, mc.moved_charge); mc.moved_charge = 0; } /* we must fixup refcnts and charges */ if (mc.moved_swap) { /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - res_counter_uncharge(&mc.from->memsw, - PAGE_SIZE * mc.moved_swap); + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); for (i = 0; i < mc.moved_swap; i++) css_put(&mc.from->css); - if (!mem_cgroup_is_root(mc.to)) { - /* - * we charged both to->res and to->memsw, so we should - * uncharge to->res. - */ - res_counter_uncharge(&mc.to->res, - PAGE_SIZE * mc.moved_swap); - } + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } @@ -7082,6 +6274,380 @@ static void __init enable_swap_cgroup(void) } #endif +#ifdef CONFIG_MEMCG_SWAP +/** + * mem_cgroup_swapout - transfer a memsw charge to swap + * @page: page whose memsw charge to transfer + * @entry: swap entry to move the charge to + * + * Transfer the memsw charge of @page to @entry. + */ +void mem_cgroup_swapout(struct page *page, swp_entry_t entry) +{ + struct page_cgroup *pc; + unsigned short oldid; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + if (!do_swap_account) + return; + + pc = lookup_page_cgroup(page); + + /* Readahead page, never charged */ + if (!PageCgroupUsed(pc)) + return; + + VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page); + + oldid = swap_cgroup_record(entry, mem_cgroup_id(pc->mem_cgroup)); + VM_BUG_ON_PAGE(oldid, page); + + pc->flags &= ~PCG_MEMSW; + css_get(&pc->mem_cgroup->css); + mem_cgroup_swap_statistics(pc->mem_cgroup, true); +} + +/** + * mem_cgroup_uncharge_swap - uncharge a swap entry + * @entry: swap entry to uncharge + * + * Drop the memsw charge associated with @entry. + */ +void mem_cgroup_uncharge_swap(swp_entry_t entry) +{ + struct mem_cgroup *memcg; + unsigned short id; + + if (!do_swap_account) + return; + + id = swap_cgroup_record(entry, 0); + rcu_read_lock(); + memcg = mem_cgroup_lookup(id); + if (memcg) { + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); + css_put(&memcg->css); + } + rcu_read_unlock(); +} +#endif + +/** + * mem_cgroup_try_charge - try charging a page + * @page: page to charge + * @mm: mm context of the victim + * @gfp_mask: reclaim mode + * @memcgp: charged memcg return + * + * Try to charge @page to the memcg that @mm belongs to, reclaiming + * pages according to @gfp_mask if necessary. + * + * Returns 0 on success, with *@memcgp pointing to the charged memcg. + * Otherwise, an error code is returned. + * + * After page->mapping has been set up, the caller must finalize the + * charge with mem_cgroup_commit_charge(). Or abort the transaction + * with mem_cgroup_cancel_charge() in case page instantiation fails. + */ +int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ + struct mem_cgroup *memcg = NULL; + unsigned int nr_pages = 1; + int ret = 0; + + if (mem_cgroup_disabled()) + goto out; + + if (PageSwapCache(page)) { + struct page_cgroup *pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. + */ + if (PageCgroupUsed(pc)) + goto out; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + if (do_swap_account && PageSwapCache(page)) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) + memcg = get_mem_cgroup_from_mm(mm); + + ret = try_charge(memcg, gfp_mask, nr_pages); + + css_put(&memcg->css); + + if (ret == -EINTR) { + memcg = root_mem_cgroup; + ret = 0; + } +out: + *memcgp = memcg; + return ret; +} + +/** + * mem_cgroup_commit_charge - commit a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * @lrucare: page might be on LRU already + * + * Finalize a charge transaction started by mem_cgroup_try_charge(), + * after page->mapping has been set up. This must happen atomically + * as part of the page instantiation, i.e. under the page table lock + * for anonymous pages, under the page lock for page and swap cache. + * + * In addition, the page must not be on the LRU during the commit, to + * prevent racing with task migration. If it might be, use @lrucare. + * + * Use mem_cgroup_cancel_charge() to cancel the transaction instead. + */ +void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, + bool lrucare) +{ + unsigned int nr_pages = 1; + + VM_BUG_ON_PAGE(!page->mapping, page); + VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page); + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + commit_charge(page, memcg, nr_pages, lrucare); + + if (do_swap_account && PageSwapCache(page)) { + swp_entry_t entry = { .val = page_private(page) }; + /* + * The swap entry might not get freed for a long time, + * let's not wait for it. The page already received a + * memory+swap charge, drop the swap entry duplicate. + */ + mem_cgroup_uncharge_swap(entry); + } +} + +/** + * mem_cgroup_cancel_charge - cancel a page charge + * @page: page to charge + * @memcg: memcg to charge the page to + * + * Cancel a charge transaction started by mem_cgroup_try_charge(). + */ +void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) +{ + unsigned int nr_pages = 1; + + if (mem_cgroup_disabled()) + return; + /* + * Swap faults will attempt to charge the same page multiple + * times. But reuse_swap_page() might have removed the page + * from swapcache already, so we can't check PageSwapCache(). + */ + if (!memcg) + return; + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + } + + cancel_charge(memcg, nr_pages); +} + +static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, + unsigned long nr_mem, unsigned long nr_memsw, + unsigned long nr_anon, unsigned long nr_file, + unsigned long nr_huge, struct page *dummy_page) +{ + unsigned long flags; + + if (nr_mem) + res_counter_uncharge(&memcg->res, nr_mem * PAGE_SIZE); + if (nr_memsw) + res_counter_uncharge(&memcg->memsw, nr_memsw * PAGE_SIZE); + + memcg_oom_recover(memcg); + + local_irq_save(flags); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file); + __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout); + __this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file); + memcg_check_events(memcg, dummy_page); + local_irq_restore(flags); +} + +static void uncharge_list(struct list_head *page_list) +{ + struct mem_cgroup *memcg = NULL; + unsigned long nr_memsw = 0; + unsigned long nr_anon = 0; + unsigned long nr_file = 0; + unsigned long nr_huge = 0; + unsigned long pgpgout = 0; + unsigned long nr_mem = 0; + struct list_head *next; + struct page *page; + + next = page_list->next; + do { + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + page = list_entry(next, struct page, lru); + next = page->lru.next; + + VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_PAGE(page_count(page), page); + + pc = lookup_page_cgroup(page); + if (!PageCgroupUsed(pc)) + continue; + + /* + * Nobody should be changing or seriously looking at + * pc->mem_cgroup and pc->flags at this point, we have + * fully exclusive access to the page. + */ + + if (memcg != pc->mem_cgroup) { + if (memcg) { + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); + pgpgout = nr_mem = nr_memsw = 0; + nr_anon = nr_file = nr_huge = 0; + } + memcg = pc->mem_cgroup; + } + + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + + if (pc->flags & PCG_MEM) + nr_mem += nr_pages; + if (pc->flags & PCG_MEMSW) + nr_memsw += nr_pages; + pc->flags = 0; + + pgpgout++; + } while (next != page_list); + + if (memcg) + uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, + nr_anon, nr_file, nr_huge, page); +} + +/** + * mem_cgroup_uncharge - uncharge a page + * @page: page to uncharge + * + * Uncharge a page previously charged with mem_cgroup_try_charge() and + * mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge(struct page *page) +{ + if (mem_cgroup_disabled()) + return; + + INIT_LIST_HEAD(&page->lru); + uncharge_list(&page->lru); +} + +/** + * mem_cgroup_uncharge_list - uncharge a list of page + * @page_list: list of pages to uncharge + * + * Uncharge a list of pages previously charged with + * mem_cgroup_try_charge() and mem_cgroup_commit_charge(). + */ +void mem_cgroup_uncharge_list(struct list_head *page_list) +{ + if (mem_cgroup_disabled()) + return; + + if (!list_empty(page_list)) + uncharge_list(page_list); +} + +/** + * mem_cgroup_migrate - migrate a charge to another page + * @oldpage: currently charged page + * @newpage: page to transfer the charge to + * @lrucare: page might be on LRU already + * + * Migrate the charge from @oldpage to @newpage. + * + * Both pages must be locked, @newpage->mapping must be set up. + */ +void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, + bool lrucare) +{ + unsigned int nr_pages = 1; + struct page_cgroup *pc; + + VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); + VM_BUG_ON_PAGE(PageLRU(oldpage), oldpage); + VM_BUG_ON_PAGE(PageLRU(newpage), newpage); + VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage); + + if (mem_cgroup_disabled()) + return; + + pc = lookup_page_cgroup(oldpage); + if (!PageCgroupUsed(pc)) + return; + + /* Already migrated */ + if (!(pc->flags & PCG_MEM)) + return; + + VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage); + pc->flags &= ~(PCG_MEM | PCG_MEMSW); + + if (PageTransHuge(oldpage)) { + nr_pages <<= compound_order(oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage); + VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage); + } + + commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare); +} + /* * subsys_initcall() for memory controller. * diff --git a/mm/memory-failure.c b/mm/memory-failure.c index c6399e328931..e3e2f007946e 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff = page_to_pgoff(page); read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; @@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, mutex_lock(&mapping->i_mmap_mutex); read_lock(&tasklist_lock); for_each_process(tsk) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); struct task_struct *t = task_early_kill(tsk, force_early); if (!t) @@ -1165,6 +1165,16 @@ int memory_failure(unsigned long pfn, int trapno, int flags) lock_page(hpage); /* + * The page could have changed compound pages during the locking. + * If this happens just bail out. + */ + if (compound_head(p) != hpage) { + action_result(pfn, "different compound page after locking", IGNORED); + res = -EBUSY; + goto out; + } + + /* * We use page flags to determine what action should be taken, but * the flags can be modified by the error containment action. One * example is an mlocked page, where PG_mlocked is cleared by diff --git a/mm/memory.c b/mm/memory.c index d67fd9fcf1f2..eb37dfb8f631 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -884,7 +884,7 @@ out_set_pte: return 0; } -int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { @@ -1292,7 +1292,6 @@ static void unmap_page_range(struct mmu_gather *tlb, details = NULL; BUG_ON(addr >= end); - mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -1302,7 +1301,6 @@ static void unmap_page_range(struct mmu_gather *tlb, next = zap_pud_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); - mem_cgroup_uncharge_end(); } @@ -2049,6 +2047,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *dirty_page = NULL; unsigned long mmun_start = 0; /* For mmu_notifiers */ unsigned long mmun_end = 0; /* For mmu_notifiers */ + struct mem_cgroup *memcg; old_page = vm_normal_page(vma, address, orig_pte); if (!old_page) { @@ -2204,7 +2203,7 @@ gotten: } __SetPageUptodate(new_page); - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) goto oom_free_new; mmun_start = address & PAGE_MASK; @@ -2234,6 +2233,8 @@ gotten: */ ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the @@ -2271,7 +2272,7 @@ gotten: new_page = old_page; ret |= VM_FAULT_WRITE; } else - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); if (new_page) page_cache_release(new_page); @@ -2399,7 +2400,10 @@ EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * We return with the mmap_sem locked or unlocked in the same cases + * as does filemap_fault(). */ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -2407,10 +2411,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, { spinlock_t *ptl; struct page *page, *swapcache; + struct mem_cgroup *memcg; swp_entry_t entry; pte_t pte; int locked; - struct mem_cgroup *ptr; int exclusive = 0; int ret = 0; @@ -2486,7 +2490,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page; } - if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) { ret = VM_FAULT_OOM; goto out_page; } @@ -2511,10 +2515,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. - * Because delete_from_swap_page() may be called by reuse_swap_page(), - * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry - * in page->private. In this case, a record in swap_cgroup is silently - * discarded at swap_free(). */ inc_mm_counter_fast(mm, MM_ANONPAGES); @@ -2530,12 +2530,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); - if (page == swapcache) + if (page == swapcache) { do_page_add_anon_rmap(page, vma, address, exclusive); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, address); - /* It's better to call commit-charge after rmap is established */ - mem_cgroup_commit_charge_swapin(page, ptr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) @@ -2568,7 +2570,7 @@ unlock: out: return ret; out_nomap: - mem_cgroup_cancel_charge_swapin(ptr); + mem_cgroup_cancel_charge(page, memcg); pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); @@ -2624,6 +2626,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags) { + struct mem_cgroup *memcg; struct page *page; spinlock_t *ptl; pte_t entry; @@ -2657,7 +2660,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, */ __SetPageUptodate(page); - if (mem_cgroup_charge_anon(page, mm, GFP_KERNEL)) + if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); @@ -2670,6 +2673,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); setpte: set_pte_at(mm, address, page_table, entry); @@ -2679,7 +2684,7 @@ unlock: pte_unmap_unlock(page_table, ptl); return 0; release: - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); page_cache_release(page); goto unlock; oom_free_page: @@ -2688,6 +2693,11 @@ oom: return VM_FAULT_OOM; } +/* + * The mmap_sem must have been held on entry, and may have been + * released depending on flags and vma->vm_ops->fault() return value. + * See filemap_fault() and __lock_page_retry(). + */ static int __do_fault(struct vm_area_struct *vma, unsigned long address, pgoff_t pgoff, unsigned int flags, struct page **page) { @@ -2744,7 +2754,7 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); else if (pte_file(*pte) && pte_file_soft_dirty(*pte)) - pte_mksoft_dirty(entry); + entry = pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); @@ -2882,7 +2892,8 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages && fault_around_pages() > 1) { + if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) && + fault_around_pages() > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) @@ -2913,6 +2924,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, pgoff_t pgoff, unsigned int flags, pte_t orig_pte) { struct page *fault_page, *new_page; + struct mem_cgroup *memcg; spinlock_t *ptl; pte_t *pte; int ret; @@ -2924,7 +2936,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (!new_page) return VM_FAULT_OOM; - if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL)) { + if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) { page_cache_release(new_page); return VM_FAULT_OOM; } @@ -2944,12 +2956,14 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto uncharge_out; } do_set_pte(vma, address, new_page, pte, true, true); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); unlock_page(fault_page); page_cache_release(fault_page); return ret; uncharge_out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); page_cache_release(new_page); return ret; } @@ -2965,6 +2979,8 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, int dirtied = 0; int ret, tmp; + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + ret = __do_fault(vma, address, pgoff, flags, &fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -2995,6 +3011,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (set_page_dirty(fault_page)) dirtied = 1; + /* + * Take a local copy of the address_space - page.mapping may be zeroed + * by truncate after unlock_page(). The address_space itself remains + * pinned by vma->vm_file's reference. We rely on unlock_page()'s + * release semantics to prevent the compiler from undoing this copying. + */ mapping = fault_page->mapping; unlock_page(fault_page); if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) { @@ -3012,6 +3034,12 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } +/* + * We enter with non-exclusive mmap_sem (to exclude vma changes, + * but allow concurrent faults). + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, unsigned int flags, pte_t orig_pte) @@ -3036,7 +3064,9 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, @@ -3168,7 +3198,10 @@ out: * * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. - * We return with mmap_sem still held, but pte unmapped and unlocked. + * We return with pte unmapped and unlocked. + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int handle_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, @@ -3177,7 +3210,7 @@ static int handle_pte_fault(struct mm_struct *mm, pte_t entry; spinlock_t *ptl; - entry = *pte; + entry = ACCESS_ONCE(*pte); if (!pte_present(entry)) { if (pte_none(entry)) { if (vma->vm_ops) { @@ -3228,6 +3261,9 @@ unlock: /* * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). */ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -3309,6 +3345,12 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, return handle_pte_fault(mm, vma, address, pte, pmd, flags); } +/* + * By the time we get here, we already hold the mm semaphore + * + * The mmap_sem may have been released depending on flags and our + * return value. See filemap_fault() and __lock_page_or_retry(). + */ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, unsigned int flags) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 469bbf505f85..a3797d3fd8a4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -284,8 +284,8 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) } #endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */ -static void grow_zone_span(struct zone *zone, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_zone_end_pfn; @@ -427,8 +427,8 @@ out_fail: return -1; } -static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, - unsigned long end_pfn) +static void __meminit grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, + unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); @@ -977,15 +977,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ zone = page_zone(pfn_to_page(pfn)); ret = -EINVAL; - if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + if ((zone_idx(zone) > ZONE_NORMAL || + online_type == MMOP_ONLINE_MOVABLE) && !can_online_high_movable(zone)) goto out; - if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (online_type == MMOP_ONLINE_KERNEL && + zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) goto out; } - if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (online_type == MMOP_ONLINE_MOVABLE && + zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) goto out; } diff --git a/mm/migrate.c b/mm/migrate.c index 9e0beaa91845..7f5a42403fae 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -778,11 +778,14 @@ static int move_to_new_page(struct page *newpage, struct page *page, rc = fallback_migrate_page(mapping, newpage, page, mode); if (rc != MIGRATEPAGE_SUCCESS) { - newpage->mapping = NULL; + if (!PageAnon(newpage)) + newpage->mapping = NULL; } else { + mem_cgroup_migrate(page, newpage, false); if (remap_swapcache) remove_migration_ptes(page, newpage); - page->mapping = NULL; + if (!PageAnon(page)) + page->mapping = NULL; } unlock_page(newpage); @@ -795,7 +798,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, { int rc = -EAGAIN; int remap_swapcache = 1; - struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; if (!trylock_page(page)) { @@ -821,9 +823,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, lock_page(page); } - /* charge against new page */ - mem_cgroup_prepare_migration(page, newpage, &mem); - if (PageWriteback(page)) { /* * Only in the case of a full synchronous migration is it @@ -833,10 +832,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ if (mode != MIGRATE_SYNC) { rc = -EBUSY; - goto uncharge; + goto out_unlock; } if (!force) - goto uncharge; + goto out_unlock; wait_on_page_writeback(page); } /* @@ -872,7 +871,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, */ remap_swapcache = 0; } else { - goto uncharge; + goto out_unlock; } } @@ -885,7 +884,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * the page migration right away (proteced by page lock). */ rc = balloon_page_migrate(newpage, page, mode); - goto uncharge; + goto out_unlock; } /* @@ -904,7 +903,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { try_to_free_buffers(page); - goto uncharge; + goto out_unlock; } goto skip_unmap; } @@ -923,10 +922,7 @@ skip_unmap: if (anon_vma) put_anon_vma(anon_vma); -uncharge: - mem_cgroup_end_migration(mem, page, newpage, - (rc == MIGRATEPAGE_SUCCESS || - rc == MIGRATEPAGE_BALLOON_SUCCESS)); +out_unlock: unlock_page(page); out: return rc; @@ -1785,7 +1781,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; struct page *new_page = NULL; - struct mem_cgroup *memcg = NULL; int page_lru = page_is_file_cache(page); unsigned long mmun_start = address & HPAGE_PMD_MASK; unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE; @@ -1851,15 +1846,6 @@ fail_putback: goto out_unlock; } - /* - * Traditional migration needs to prepare the memcg charge - * transaction early to prevent the old page from being - * uncharged when installing migration entries. Here we can - * save the potential rollback and start the charge transfer - * only when migration is already known to end successfully. - */ - mem_cgroup_prepare_migration(page, new_page, &memcg); - orig_entry = *pmd; entry = mk_pmd(new_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); @@ -1887,14 +1873,10 @@ fail_putback: goto fail_putback; } + mem_cgroup_migrate(page, new_page, false); + page_remove_rmap(page); - /* - * Finish the charge transaction under the page table lock to - * prevent split_huge_page() from dividing up the charge - * before it's fully transferred to the new page. - */ - mem_cgroup_end_migration(memcg, page, new_page, true); spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); diff --git a/mm/mlock.c b/mm/mlock.c index b1eb53634005..ce84cb0b83ef 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -210,12 +210,19 @@ out: * @vma: target vma * @start: start address * @end: end address + * @nonblocking: * * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * - * vma->vm_mm->mmap_sem must be held for at least read. + * vma->vm_mm->mmap_sem must be held. + * + * If @nonblocking is NULL, it may be held for read or write and will + * be unperturbed. + * + * If @nonblocking is non-NULL, it must held for read only and may be + * released. If it's released, *@nonblocking will be set to 0. */ long __mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, int *nonblocking) diff --git a/mm/mmap.c b/mm/mmap.c index 129b847d30cc..64c9d736155c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -31,6 +31,7 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> +#include <linux/mmdebug.h> #include <linux/perf_event.h> #include <linux/audit.h> #include <linux/khugepaged.h> @@ -134,6 +135,10 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) { unsigned long free, allowed, reserve; + VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) < + -(s64)vm_committed_as_batch * num_online_cpus(), + "memory commitment underflow"); + vm_acct_memory(pages); /* diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 518e2c3f4c75..e0c943014eb7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1306,9 +1306,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi, *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); if (bdi_bg_thresh) - *bdi_bg_thresh = div_u64((u64)*bdi_thresh * - background_thresh, - dirty_thresh); + *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh * + background_thresh, + dirty_thresh) : 0; /* * In order to avoid the stacked BDI deadlock we need diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ea758b898fd..b99643d42a93 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -680,9 +680,12 @@ static void free_pcppages_bulk(struct zone *zone, int count, int migratetype = 0; int batch_free = 0; int to_free = count; + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); while (to_free) { struct page *page; @@ -731,8 +734,11 @@ static void free_one_page(struct zone *zone, unsigned int order, int migratetype) { + unsigned long nr_scanned; spin_lock(&zone->lock); - zone->pages_scanned = 0; + nr_scanned = zone_page_state(zone, NR_PAGES_SCANNED); + if (nr_scanned) + __mod_zone_page_state(zone, NR_PAGES_SCANNED, -nr_scanned); __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) @@ -1257,15 +1263,11 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; - int to_drain; - unsigned long batch; + int to_drain, batch; local_irq_save(flags); batch = ACCESS_ONCE(pcp->batch); - if (pcp->count >= batch) - to_drain = batch; - else - to_drain = pcp->count; + to_drain = min(pcp->count, batch); if (to_drain > 0) { free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; @@ -1610,6 +1612,9 @@ again: } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); + if (zone_page_state(zone, NR_ALLOC_BATCH) == 0 && + !zone_is_fair_depleted(zone)) + zone_set_flag(zone, ZONE_FAIR_DEPLETED); __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone, gfp_flags); @@ -1712,7 +1717,6 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, { /* free_pages my go negative - that's OK */ long min = mark; - long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; long free_cma = 0; @@ -1727,7 +1731,7 @@ static bool __zone_watermark_ok(struct zone *z, unsigned int order, free_cma = zone_page_state(z, NR_FREE_CMA_PAGES); #endif - if (free_pages - free_cma <= min + lowmem_reserve) + if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx]) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ @@ -1922,6 +1926,18 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ +static void reset_alloc_batches(struct zone *preferred_zone) +{ + struct zone *zone = preferred_zone->zone_pgdat->node_zones; + + do { + mod_zone_page_state(zone, NR_ALLOC_BATCH, + high_wmark_pages(zone) - low_wmark_pages(zone) - + atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); + zone_clear_flag(zone, ZONE_FAIR_DEPLETED); + } while (zone++ != preferred_zone); +} + /* * get_page_from_freelist goes through the zonelist trying to allocate * a page. @@ -1939,8 +1955,12 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int did_zlc_setup = 0; /* just call zlc_setup() one time */ bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); + int nr_fair_skipped = 0; + bool zonelist_rescan; zonelist_scan: + zonelist_rescan = false; + /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c. @@ -1964,9 +1984,11 @@ zonelist_scan: */ if (alloc_flags & ALLOC_FAIR) { if (!zone_local(preferred_zone, zone)) + break; + if (zone_is_fair_depleted(zone)) { + nr_fair_skipped++; continue; - if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0) - continue; + } } /* * When allocating a page cache page for writing, we @@ -2072,13 +2094,7 @@ this_zone_full: zlc_mark_zone_full(zonelist, z); } - if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { - /* Disable zlc cache for second zonelist scan */ - zlc_active = 0; - goto zonelist_scan; - } - - if (page) + if (page) { /* * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was * necessary to allocate the page. The expectation is @@ -2087,8 +2103,37 @@ this_zone_full: * for !PFMEMALLOC purposes. */ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); + return page; + } - return page; + /* + * The first pass makes sure allocations are spread fairly within the + * local node. However, the local node might have free pages left + * after the fairness batches are exhausted, and remote zones haven't + * even been considered yet. Try once more without fairness, and + * include remote zones now, before entering the slowpath and waking + * kswapd: prefer spilling to a remote zone over swapping locally. + */ + if (alloc_flags & ALLOC_FAIR) { + alloc_flags &= ~ALLOC_FAIR; + if (nr_fair_skipped) { + zonelist_rescan = true; + reset_alloc_batches(preferred_zone); + } + if (nr_online_nodes > 1) + zonelist_rescan = true; + } + + if (unlikely(IS_ENABLED(CONFIG_NUMA) && zlc_active)) { + /* Disable zlc cache for second zonelist scan */ + zlc_active = 0; + zonelist_rescan = true; + } + + if (zonelist_rescan) + goto zonelist_scan; + + return NULL; } /* @@ -2409,28 +2454,6 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, return page; } -static void reset_alloc_batches(struct zonelist *zonelist, - enum zone_type high_zoneidx, - struct zone *preferred_zone) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - /* - * Only reset the batches of zones that were actually - * considered in the fairness pass, we don't want to - * trash fairness information for zones that are not - * actually part of this zonelist's round-robin cycle. - */ - if (!zone_local(preferred_zone, zone)) - continue; - mod_zone_page_state(zone, NR_ALLOC_BATCH, - high_wmark_pages(zone) - low_wmark_pages(zone) - - atomic_long_read(&zone->vm_stat[NR_ALLOC_BATCH])); - } -} - static void wake_all_kswapds(unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2766,29 +2789,12 @@ retry_cpuset: if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; #endif -retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* - * The first pass makes sure allocations are spread - * fairly within the local node. However, the local - * node might have free pages left after the fairness - * batches are exhausted, and remote zones haven't - * even been considered yet. Try once more without - * fairness, and include remote zones now, before - * entering the slowpath and waking kswapd: prefer - * spilling to a remote zone over swapping locally. - */ - if (alloc_flags & ALLOC_FAIR) { - reset_alloc_batches(zonelist, high_zoneidx, - preferred_zone); - alloc_flags &= ~ALLOC_FAIR; - goto retry; - } - /* * Runtime PM, block IO and its error handling path * can deadlock because I/O on the device might not * complete. @@ -2962,7 +2968,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * Note this is not alloc_pages_exact_node() which allocates on a specific node, * but is not exact. */ -void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) { unsigned order = get_order(size); struct page *p = alloc_pages_node(nid, gfp_mask, order); @@ -2970,7 +2976,6 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); } -EXPORT_SYMBOL(alloc_pages_exact_nid); /** * free_pages_exact - release memory allocated via alloc_pages_exact() @@ -3052,7 +3057,7 @@ static inline void show_node(struct zone *zone) void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; - val->sharedram = 0; + val->sharedram = global_page_state(NR_SHMEM); val->freeram = global_page_state(NR_FREE_PAGES); val->bufferram = nr_blockdev_pages(); val->totalhigh = totalhigh_pages; @@ -3072,6 +3077,7 @@ void si_meminfo_node(struct sysinfo *val, int nid) for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) managed_pages += pgdat->node_zones[zone_type].managed_pages; val->totalram = managed_pages; + val->sharedram = node_page_state(nid, NR_SHMEM); val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3253,12 +3259,12 @@ void show_free_areas(unsigned int filter) K(zone_page_state(zone, NR_BOUNCE)), K(zone_page_state(zone, NR_FREE_CMA_PAGES)), K(zone_page_state(zone, NR_WRITEBACK_TEMP)), - zone->pages_scanned, + K(zone_page_state(zone, NR_PAGES_SCANNED)), (!zone_reclaimable(zone) ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) - printk(" %lu", zone->lowmem_reserve[i]); + printk(" %ld", zone->lowmem_reserve[i]); printk("\n"); } @@ -4969,6 +4975,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_start_pfn = node_start_pfn; #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); + printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid, + (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1); #endif calculate_node_totalpages(pgdat, start_pfn, end_pfn, zones_size, zholes_size); @@ -5579,7 +5587,7 @@ static void calculate_totalreserve_pages(void) for_each_online_pgdat(pgdat) { for (i = 0; i < MAX_NR_ZONES; i++) { struct zone *zone = pgdat->node_zones + i; - unsigned long max = 0; + long max = 0; /* Find valid and maximum lowmem_reserve in the zone */ for (j = i; j < MAX_NR_ZONES; j++) { @@ -6062,10 +6070,11 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) } /** - * get_pageblock_flags_group - Return the requested group of flags for the pageblock_nr_pages block of pages + * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest to retrieve + * @pfn: Page Number of the page * @end_bitidx: The last bit of interest + * @mask: The mask for flags * returns pageblock_bits flags */ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, @@ -6091,9 +6100,10 @@ unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, /** * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest - * @start_bitidx: The first bit of interest - * @end_bitidx: The last bit of interest * @flags: The flags to set + * @pfn: Page Number of the page + * @end_bitidx: The last bit of interest + * @mask: The mask for flags */ void set_pfnblock_flags_mask(struct page *page, unsigned long flags, unsigned long pfn, diff --git a/mm/readahead.c b/mm/readahead.c index 0ca36a7770b1..17b9172ec37f 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -326,7 +326,6 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra, * - thrashing threshold in memory tight systems */ static pgoff_t count_history_pages(struct address_space *mapping, - struct file_ra_state *ra, pgoff_t offset, unsigned long max) { pgoff_t head; @@ -349,7 +348,7 @@ static int try_context_readahead(struct address_space *mapping, { pgoff_t size; - size = count_history_pages(mapping, ra, offset, max); + size = count_history_pages(mapping, offset, max); /* * not enough history pages: diff --git a/mm/rmap.c b/mm/rmap.c index b7e94ebbd09e..3e8491c504f8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -517,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma) static inline unsigned long __vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (unlikely(is_vm_hugetlb_page(vma))) - pgoff = page->index << huge_page_order(page_hstate(page)); - + pgoff_t pgoff = page_to_pgoff(page); return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); } @@ -1036,25 +1032,6 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - - VM_BUG_ON_PAGE(PageLRU(page), page); - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { - SetPageActive(page); - lru_cache_add(page); - return; - } - - if (!TestSetPageMlocked(page)) { - /* - * We use the irq-unsafe __mod_zone_page_stat because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - add_page_to_unevictable_list(page); } /** @@ -1112,7 +1089,6 @@ void page_remove_rmap(struct page *page) if (unlikely(PageHuge(page))) goto out; if (anon) { - mem_cgroup_uncharge_page(page); if (PageTransHuge(page)) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1639,7 +1615,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + pgoff_t pgoff = page_to_pgoff(page); struct anon_vma_chain *avc; int ret = SWAP_AGAIN; @@ -1680,7 +1656,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc) static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc) { struct address_space *mapping = page->mapping; - pgoff_t pgoff = page->index << compound_order(page); + pgoff_t pgoff = page_to_pgoff(page); struct vm_area_struct *vma; int ret = SWAP_AGAIN; diff --git a/mm/shmem.c b/mm/shmem.c index 1140f49b6ded..b16d3e7e8d9a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt; * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { - int mode; /* FALLOC_FL mode currently operating */ + wait_queue_head_t *waitq; /* faults into hole wait for punch to end */ pgoff_t start; /* start of range currently being fallocated */ pgoff_t next; /* the next page offset to be fallocated */ pgoff_t nr_falloced; /* how many new pages have been fallocated */ @@ -149,6 +149,19 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size) vm_unacct_memory(VM_ACCT(size)); } +static inline int shmem_reacct_size(unsigned long flags, + loff_t oldsize, loff_t newsize) +{ + if (!(flags & VM_NORESERVE)) { + if (VM_ACCT(newsize) > VM_ACCT(oldsize)) + return security_vm_enough_memory_mm(current->mm, + VM_ACCT(newsize) - VM_ACCT(oldsize)); + else if (VM_ACCT(newsize) < VM_ACCT(oldsize)) + vm_unacct_memory(VM_ACCT(oldsize) - VM_ACCT(newsize)); + } + return 0; +} + /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow huge sparse files. @@ -280,7 +293,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, */ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, - pgoff_t index, gfp_t gfp, void *expected) + pgoff_t index, void *expected) { int error; @@ -406,7 +419,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, pvec.pages, indices); if (!pvec.nr) break; - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -434,7 +446,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -468,24 +479,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, return; index = start; - for ( ; ; ) { + while (index < end) { cond_resched(); pvec.nr = find_get_entries(mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), pvec.pages, indices); if (!pvec.nr) { - if (index == start || unfalloc) + /* If all gone or hole-punch or unfalloc, we're done */ + if (index == start || end != -1) break; + /* But if truncating, restart to make sure all gone */ index = start; continue; } - if ((index == start || unfalloc) && indices[0] >= end) { - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - break; - } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -496,8 +503,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (radix_tree_exceptional_entry(page)) { if (unfalloc) continue; - nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + if (shmem_free_swap(mapping, index, page)) { + /* Swap was replaced by page: retry */ + index--; + break; + } + nr_swaps_freed++; continue; } @@ -506,13 +517,17 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (page->mapping == mapping) { VM_BUG_ON_PAGE(PageWriteback(page), page); truncate_inode_page(mapping, page); + } else { + /* Page was replaced by swap: retry */ + unlock_page(page); + index--; + break; } } unlock_page(page); } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; } @@ -543,6 +558,10 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t newsize = attr->ia_size; if (newsize != oldsize) { + error = shmem_reacct_size(SHMEM_I(inode)->flags, + oldsize, newsize); + if (error) + return error; i_size_write(inode, newsize); inode->i_ctime = inode->i_mtime = CURRENT_TIME; } @@ -598,7 +617,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, radswap = swp_to_radix_entry(swap); index = radix_tree_locate_item(&mapping->page_tree, radswap); if (index == -1) - return 0; + return -EAGAIN; /* tell shmem_unuse we found nothing */ /* * Move _head_ to start search for next from here. @@ -643,7 +662,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, */ if (!error) error = shmem_add_to_page_cache(*pagep, mapping, index, - GFP_NOWAIT, radswap); + radswap); if (error != -ENOMEM) { /* * Truncation and eviction use free_swap_and_cache(), which @@ -657,7 +676,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, spin_unlock(&info->lock); swap_free(swap); } - error = 1; /* not an error, but entry was found */ } return error; } @@ -669,7 +687,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page) { struct list_head *this, *next; struct shmem_inode_info *info; - int found = 0; + struct mem_cgroup *memcg; int error = 0; /* @@ -684,26 +702,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page) * the shmem_swaplist_mutex which might hold up shmem_writepage(). * Charged back to the user (not to caller) when swap account is used. */ - error = mem_cgroup_charge_file(page, current->mm, GFP_KERNEL); + error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg); if (error) goto out; /* No radix_tree_preload: swap entry keeps a place for page in tree */ + error = -EAGAIN; mutex_lock(&shmem_swaplist_mutex); list_for_each_safe(this, next, &shmem_swaplist) { info = list_entry(this, struct shmem_inode_info, swaplist); if (info->swapped) - found = shmem_unuse_inode(info, swap, &page); + error = shmem_unuse_inode(info, swap, &page); else list_del_init(&info->swaplist); cond_resched(); - if (found) + if (error != -EAGAIN) break; + /* found nothing in this: move on to search the next */ } mutex_unlock(&shmem_swaplist_mutex); - if (found < 0) - error = found; + if (error) { + if (error != -ENOMEM) + error = 0; + mem_cgroup_cancel_charge(page, memcg); + } else + mem_cgroup_commit_charge(page, memcg, true); out: unlock_page(page); page_cache_release(page); @@ -760,7 +784,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; if (shmem_falloc && - !shmem_falloc->mode && + !shmem_falloc->waitq && index >= shmem_falloc->start && index < shmem_falloc->next) shmem_falloc->nr_unswapped++; @@ -807,7 +831,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); - swapcache_free(swap, NULL); + swapcache_free(swap); redirty: set_page_dirty(page); if (wbc->for_reclaim) @@ -980,7 +1004,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, */ oldpage = newpage; } else { - mem_cgroup_replace_page_cache(oldpage, newpage); + mem_cgroup_migrate(oldpage, newpage, false); lru_cache_add_anon(newpage); *pagep = newpage; } @@ -1007,6 +1031,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo; + struct mem_cgroup *memcg; struct page *page; swp_entry_t swap; int error; @@ -1085,11 +1110,10 @@ repeat: goto failed; } - error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, swp_to_radix_entry(swap)); + swp_to_radix_entry(swap)); /* * We already confirmed swap under page lock, and make * no memory allocation here, so usually no possibility @@ -1102,12 +1126,16 @@ repeat: * Reset swap.val? No, leave it so "failed" goes back to * "repeat": reading a hole and writing should succeed. */ - if (error) + if (error) { + mem_cgroup_cancel_charge(page, memcg); delete_from_swap_cache(page); + } } if (error) goto failed; + mem_cgroup_commit_charge(page, memcg, true); + spin_lock(&info->lock); info->swapped--; shmem_recalc_inode(inode); @@ -1143,22 +1171,22 @@ repeat: __SetPageSwapBacked(page); __set_page_locked(page); if (sgp == SGP_WRITE) - init_page_accessed(page); + __SetPageReferenced(page); - error = mem_cgroup_charge_file(page, current->mm, - gfp & GFP_RECLAIM_MASK); + error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg); if (error) goto decused; error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK); if (!error) { error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); + NULL); radix_tree_preload_end(); } if (error) { - mem_cgroup_uncharge_cache_page(page); + mem_cgroup_cancel_charge(page, memcg); goto decused; } + mem_cgroup_commit_charge(page, memcg, false); lru_cache_add_anon(page); spin_lock(&info->lock); @@ -1248,38 +1276,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn * locks writers out with its hold on i_mutex. So refrain from - * faulting pages into the hole while it's being punched, and - * wait on i_mutex to be released if vmf->flags permits. + * faulting pages into the hole while it's being punched. Although + * shmem_undo_range() does remove the additions, it may be unable to + * keep up, as each new page needs its own unmap_mapping_range() call, + * and the i_mmap tree grows ever slower to scan if new vmas are added. + * + * It does not matter if we sometimes reach this check just before the + * hole-punch begins, so that one fault then races with the punch: + * we just need to make racing faults a rare case. + * + * The implementation below would be much simpler if we just used a + * standard mutex or completion: but we cannot take i_mutex in fault, + * and bloating every shmem inode for this unlikely case would be sad. */ if (unlikely(inode->i_private)) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); shmem_falloc = inode->i_private; - if (!shmem_falloc || - shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE || - vmf->pgoff < shmem_falloc->start || - vmf->pgoff >= shmem_falloc->next) - shmem_falloc = NULL; - spin_unlock(&inode->i_lock); - /* - * i_lock has protected us from taking shmem_falloc seriously - * once return from shmem_fallocate() went back up that stack. - * i_lock does not serialize with i_mutex at all, but it does - * not matter if sometimes we wait unnecessarily, or sometimes - * miss out on waiting: we just need to make those cases rare. - */ - if (shmem_falloc) { + if (shmem_falloc && + shmem_falloc->waitq && + vmf->pgoff >= shmem_falloc->start && + vmf->pgoff < shmem_falloc->next) { + wait_queue_head_t *shmem_falloc_waitq; + DEFINE_WAIT(shmem_fault_wait); + + ret = VM_FAULT_NOPAGE; if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) && !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) { + /* It's polite to up mmap_sem if we can */ up_read(&vma->vm_mm->mmap_sem); - mutex_lock(&inode->i_mutex); - mutex_unlock(&inode->i_mutex); - return VM_FAULT_RETRY; + ret = VM_FAULT_RETRY; } - /* cond_resched? Leave that to GUP or return to user */ - return VM_FAULT_NOPAGE; + + shmem_falloc_waitq = shmem_falloc->waitq; + prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait, + TASK_KILLABLE); + spin_unlock(&inode->i_lock); + schedule(); + + /* + * shmem_falloc_waitq points into the shmem_fallocate() + * stack of the hole-punching task: shmem_falloc_waitq + * is usually invalid by the time we reach here, but + * finish_wait() does not dereference it in that case; + * though i_lock needed lest racing with wake_up_all(). + */ + spin_lock(&inode->i_lock); + finish_wait(shmem_falloc_waitq, &shmem_fault_wait); + spin_unlock(&inode->i_lock); + return ret; } + spin_unlock(&inode->i_lock); } error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); @@ -1774,13 +1822,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&inode->i_mutex); - shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE; - if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; loff_t unmap_start = round_up(offset, PAGE_SIZE); loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); + shmem_falloc.waitq = &shmem_falloc_waitq; shmem_falloc.start = unmap_start >> PAGE_SHIFT; shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT; spin_lock(&inode->i_lock); @@ -1792,8 +1840,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, 1 + unmap_end - unmap_start, 0); shmem_truncate_range(inode, offset, offset + len - 1); /* No need to unmap again: hole-punching leaves COWed pages */ + + spin_lock(&inode->i_lock); + inode->i_private = NULL; + wake_up_all(&shmem_falloc_waitq); + spin_unlock(&inode->i_lock); error = 0; - goto undone; + goto out; } /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ @@ -1809,6 +1862,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto out; } + shmem_falloc.waitq = NULL; shmem_falloc.start = start; shmem_falloc.next = start; shmem_falloc.nr_falloced = 0; @@ -2900,16 +2954,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, this.len = strlen(name); this.hash = 0; /* will go */ sb = shm_mnt->mnt_sb; + path.mnt = mntget(shm_mnt); path.dentry = d_alloc_pseudo(sb, &this); if (!path.dentry) goto put_memory; d_set_d_op(path.dentry, &anon_ops); - path.mnt = mntget(shm_mnt); res = ERR_PTR(-ENOSPC); inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto put_dentry; + goto put_memory; inode->i_flags |= i_flags; d_instantiate(path.dentry, inode); @@ -2917,19 +2971,19 @@ static struct file *__shmem_file_setup(const char *name, loff_t size, clear_nlink(inode); /* It is unlinked */ res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size)); if (IS_ERR(res)) - goto put_dentry; + goto put_path; res = alloc_file(&path, FMODE_WRITE | FMODE_READ, &shmem_file_operations); if (IS_ERR(res)) - goto put_dentry; + goto put_path; return res; -put_dentry: - path_put(&path); put_memory: shmem_unacct_size(flags, size); +put_path: + path_put(&path); return res; } diff --git a/mm/slab.c b/mm/slab.c index 3070b929a1bf..1351725f7936 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -191,7 +191,6 @@ struct array_cache { unsigned int limit; unsigned int batchcount; unsigned int touched; - spinlock_t lock; void *entry[]; /* * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing @@ -203,6 +202,11 @@ struct array_cache { */ }; +struct alien_cache { + spinlock_t lock; + struct array_cache ac; +}; + #define SLAB_OBJ_PFMEMALLOC 1 static inline bool is_obj_pfmemalloc(void *objp) { @@ -242,7 +246,8 @@ static struct kmem_cache_node __initdata init_kmem_cache_node[NUM_INIT_LISTS]; static int drain_freelist(struct kmem_cache *cache, struct kmem_cache_node *n, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, - int node); + int node, struct list_head *list); +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); @@ -267,7 +272,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->node[nodeid]->slab), listp); \ + list_splice(&get_node(cachep, nodeid)->slab, listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -465,143 +470,6 @@ static struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", }; -#define BAD_ALIEN_MAGIC 0x01020304ul - -#ifdef CONFIG_LOCKDEP - -/* - * Slab sometimes uses the kmalloc slabs to store the slab headers - * for other slabs "off slab". - * The locking for this is tricky in that it nests within the locks - * of all other slabs in a few places; to deal with this special - * locking we put on-slab caches into a separate lock-class. - * - * We set lock class for alien array caches which are up during init. - * The lock annotation will be lost if all cpus of a node goes down and - * then comes back up during hotplug - */ -static struct lock_class_key on_slab_l3_key; -static struct lock_class_key on_slab_alc_key; - -static struct lock_class_key debugobj_l3_key; -static struct lock_class_key debugobj_alc_key; - -static void slab_set_lock_classes(struct kmem_cache *cachep, - struct lock_class_key *l3_key, struct lock_class_key *alc_key, - int q) -{ - struct array_cache **alc; - struct kmem_cache_node *n; - int r; - - n = cachep->node[q]; - if (!n) - return; - - lockdep_set_class(&n->list_lock, l3_key); - alc = n->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - return; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, alc_key); - } -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) -{ - slab_set_lock_classes(cachep, &debugobj_l3_key, &debugobj_alc_key, node); -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ - int node; - - for_each_online_node(node) - slab_set_debugobj_lock_classes_node(cachep, node); -} - -static void init_node_lock_keys(int q) -{ - int i; - - if (slab_state < UP) - return; - - for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache_node *n; - struct kmem_cache *cache = kmalloc_caches[i]; - - if (!cache) - continue; - - n = cache->node[q]; - if (!n || OFF_SLAB(cache)) - continue; - - slab_set_lock_classes(cache, &on_slab_l3_key, - &on_slab_alc_key, q); - } -} - -static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) -{ - if (!cachep->node[q]) - return; - - slab_set_lock_classes(cachep, &on_slab_l3_key, - &on_slab_alc_key, q); -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ - int node; - - VM_BUG_ON(OFF_SLAB(cachep)); - for_each_node(node) - on_slab_lock_classes_node(cachep, node); -} - -static inline void init_lock_keys(void) -{ - int node; - - for_each_node(node) - init_node_lock_keys(node); -} -#else -static void init_node_lock_keys(int q) -{ -} - -static inline void init_lock_keys(void) -{ -} - -static inline void on_slab_lock_classes(struct kmem_cache *cachep) -{ -} - -static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) -{ -} - -static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) -{ -} - -static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) -{ -} -#endif - static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) @@ -792,13 +660,8 @@ static void start_cpu_timer(int cpu) } } -static struct array_cache *alloc_arraycache(int node, int entries, - int batchcount, gfp_t gfp) +static void init_arraycache(struct array_cache *ac, int limit, int batch) { - int memsize = sizeof(void *) * entries + sizeof(struct array_cache); - struct array_cache *nc = NULL; - - nc = kmalloc_node(memsize, gfp, node); /* * The array_cache structures contain pointers to free object. * However, when such objects are allocated or transferred to another @@ -806,15 +669,24 @@ static struct array_cache *alloc_arraycache(int node, int entries, * valid references during a kmemleak scan. Therefore, kmemleak must * not scan such objects. */ - kmemleak_no_scan(nc); - if (nc) { - nc->avail = 0; - nc->limit = entries; - nc->batchcount = batchcount; - nc->touched = 0; - spin_lock_init(&nc->lock); + kmemleak_no_scan(ac); + if (ac) { + ac->avail = 0; + ac->limit = limit; + ac->batchcount = batch; + ac->touched = 0; } - return nc; +} + +static struct array_cache *alloc_arraycache(int node, int entries, + int batchcount, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct array_cache); + struct array_cache *ac = NULL; + + ac = kmalloc_node(memsize, gfp, node); + init_arraycache(ac, entries, batchcount); + return ac; } static inline bool is_slab_pfmemalloc(struct page *page) @@ -826,7 +698,7 @@ static inline bool is_slab_pfmemalloc(struct page *page) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *n = cachep->node[numa_mem_id()]; + struct kmem_cache_node *n = get_node(cachep, numa_mem_id()); struct page *page; unsigned long flags; @@ -881,7 +753,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - n = cachep->node[numa_mem_id()]; + n = get_node(cachep, numa_mem_id()); if (!list_empty(&n->slabs_free) && force_refill) { struct page *page = virt_to_head_page(objp); ClearPageSlabPfmemalloc(page); @@ -961,12 +833,13 @@ static int transfer_objects(struct array_cache *to, #define drain_alien_cache(cachep, alien) do { } while (0) #define reap_alien(cachep, n) do { } while (0) -static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static inline struct alien_cache **alloc_alien_cache(int node, + int limit, gfp_t gfp) { - return (struct array_cache **)BAD_ALIEN_MAGIC; + return NULL; } -static inline void free_alien_cache(struct array_cache **ac_ptr) +static inline void free_alien_cache(struct alien_cache **ac_ptr) { } @@ -992,46 +865,60 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep, static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); static void *alternate_node_alloc(struct kmem_cache *, gfp_t); -static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) +static struct alien_cache *__alloc_alien_cache(int node, int entries, + int batch, gfp_t gfp) +{ + size_t memsize = sizeof(void *) * entries + sizeof(struct alien_cache); + struct alien_cache *alc = NULL; + + alc = kmalloc_node(memsize, gfp, node); + init_arraycache(&alc->ac, entries, batch); + spin_lock_init(&alc->lock); + return alc; +} + +static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) { - struct array_cache **ac_ptr; - int memsize = sizeof(void *) * nr_node_ids; + struct alien_cache **alc_ptr; + size_t memsize = sizeof(void *) * nr_node_ids; int i; if (limit > 1) limit = 12; - ac_ptr = kzalloc_node(memsize, gfp, node); - if (ac_ptr) { - for_each_node(i) { - if (i == node || !node_online(i)) - continue; - ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); - if (!ac_ptr[i]) { - for (i--; i >= 0; i--) - kfree(ac_ptr[i]); - kfree(ac_ptr); - return NULL; - } + alc_ptr = kzalloc_node(memsize, gfp, node); + if (!alc_ptr) + return NULL; + + for_each_node(i) { + if (i == node || !node_online(i)) + continue; + alc_ptr[i] = __alloc_alien_cache(node, limit, 0xbaadf00d, gfp); + if (!alc_ptr[i]) { + for (i--; i >= 0; i--) + kfree(alc_ptr[i]); + kfree(alc_ptr); + return NULL; } } - return ac_ptr; + return alc_ptr; } -static void free_alien_cache(struct array_cache **ac_ptr) +static void free_alien_cache(struct alien_cache **alc_ptr) { int i; - if (!ac_ptr) + if (!alc_ptr) return; for_each_node(i) - kfree(ac_ptr[i]); - kfree(ac_ptr); + kfree(alc_ptr[i]); + kfree(alc_ptr); } static void __drain_alien_cache(struct kmem_cache *cachep, - struct array_cache *ac, int node) + struct array_cache *ac, int node, + struct list_head *list) { - struct kmem_cache_node *n = cachep->node[node]; + struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { spin_lock(&n->list_lock); @@ -1043,7 +930,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, if (n->shared) transfer_objects(n->shared, ac, ac->limit); - free_block(cachep, ac->entry, ac->avail, node); + free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; spin_unlock(&n->list_lock); } @@ -1057,28 +944,40 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *n) int node = __this_cpu_read(slab_reap_node); if (n->alien) { - struct array_cache *ac = n->alien[node]; + struct alien_cache *alc = n->alien[node]; + struct array_cache *ac; + + if (alc) { + ac = &alc->ac; + if (ac->avail && spin_trylock_irq(&alc->lock)) { + LIST_HEAD(list); - if (ac && ac->avail && spin_trylock_irq(&ac->lock)) { - __drain_alien_cache(cachep, ac, node); - spin_unlock_irq(&ac->lock); + __drain_alien_cache(cachep, ac, node, &list); + spin_unlock_irq(&alc->lock); + slabs_destroy(cachep, &list); + } } } } static void drain_alien_cache(struct kmem_cache *cachep, - struct array_cache **alien) + struct alien_cache **alien) { int i = 0; + struct alien_cache *alc; struct array_cache *ac; unsigned long flags; for_each_online_node(i) { - ac = alien[i]; - if (ac) { - spin_lock_irqsave(&ac->lock, flags); - __drain_alien_cache(cachep, ac, i); - spin_unlock_irqrestore(&ac->lock, flags); + alc = alien[i]; + if (alc) { + LIST_HEAD(list); + + ac = &alc->ac; + spin_lock_irqsave(&alc->lock, flags); + __drain_alien_cache(cachep, ac, i, &list); + spin_unlock_irqrestore(&alc->lock, flags); + slabs_destroy(cachep, &list); } } } @@ -1087,8 +986,10 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { int nodeid = page_to_nid(virt_to_page(objp)); struct kmem_cache_node *n; - struct array_cache *alien = NULL; + struct alien_cache *alien = NULL; + struct array_cache *ac; int node; + LIST_HEAD(list); node = numa_mem_id(); @@ -1099,21 +1000,25 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(nodeid == node)) return 0; - n = cachep->node[node]; + n = get_node(cachep, node); STATS_INC_NODEFREES(cachep); if (n->alien && n->alien[nodeid]) { alien = n->alien[nodeid]; + ac = &alien->ac; spin_lock(&alien->lock); - if (unlikely(alien->avail == alien->limit)) { + if (unlikely(ac->avail == ac->limit)) { STATS_INC_ACOVERFLOW(cachep); - __drain_alien_cache(cachep, alien, nodeid); + __drain_alien_cache(cachep, ac, nodeid, &list); } - ac_put_obj(cachep, alien, objp); + ac_put_obj(cachep, ac, objp); spin_unlock(&alien->lock); + slabs_destroy(cachep, &list); } else { - spin_lock(&(cachep->node[nodeid])->list_lock); - free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->node[nodeid])->list_lock); + n = get_node(cachep, nodeid); + spin_lock(&n->list_lock); + free_block(cachep, &objp, 1, nodeid, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); } return 1; } @@ -1132,7 +1037,7 @@ static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *n; - const int memsize = sizeof(struct kmem_cache_node); + const size_t memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* @@ -1140,7 +1045,8 @@ static int init_cache_node_node(int node) * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->node[node]) { + n = get_node(cachep, node); + if (!n) { n = kmalloc_node(memsize, GFP_KERNEL, node); if (!n) return -ENOMEM; @@ -1156,11 +1062,11 @@ static int init_cache_node_node(int node) cachep->node[node] = n; } - spin_lock_irq(&cachep->node[node]->list_lock); - cachep->node[node]->free_limit = + spin_lock_irq(&n->list_lock); + n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->node[node]->list_lock); + spin_unlock_irq(&n->list_lock); } return 0; } @@ -1181,12 +1087,13 @@ static void cpuup_canceled(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared; - struct array_cache **alien; + struct alien_cache **alien; + LIST_HEAD(list); /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) goto free_array_cache; @@ -1196,7 +1103,7 @@ static void cpuup_canceled(long cpu) /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; if (nc) - free_block(cachep, nc->entry, nc->avail, node); + free_block(cachep, nc->entry, nc->avail, node, &list); if (!cpumask_empty(mask)) { spin_unlock_irq(&n->list_lock); @@ -1206,7 +1113,7 @@ static void cpuup_canceled(long cpu) shared = n->shared; if (shared) { free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = NULL; } @@ -1221,6 +1128,7 @@ static void cpuup_canceled(long cpu) free_alien_cache(alien); } free_array_cache: + slabs_destroy(cachep, &list); kfree(nc); } /* @@ -1229,7 +1137,7 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; drain_freelist(cachep, n, slabs_tofree(cachep, n)); @@ -1260,7 +1168,7 @@ static int cpuup_prepare(long cpu) list_for_each_entry(cachep, &slab_caches, list) { struct array_cache *nc; struct array_cache *shared = NULL; - struct array_cache **alien = NULL; + struct alien_cache **alien = NULL; nc = alloc_arraycache(node, cachep->limit, cachep->batchcount, GFP_KERNEL); @@ -1284,7 +1192,7 @@ static int cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(!n); spin_lock_irq(&n->list_lock); @@ -1305,13 +1213,7 @@ static int cpuup_prepare(long cpu) spin_unlock_irq(&n->list_lock); kfree(shared); free_alien_cache(alien); - if (cachep->flags & SLAB_DEBUG_OBJECTS) - slab_set_debugobj_lock_classes_node(cachep, node); - else if (!OFF_SLAB(cachep) && - !(cachep->flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes_node(cachep, node); } - init_node_lock_keys(node); return 0; bad: @@ -1395,7 +1297,7 @@ static int __meminit drain_cache_node_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *n; - n = cachep->node[node]; + n = get_node(cachep, node); if (!n) continue; @@ -1575,10 +1477,6 @@ void __init kmem_cache_init(void) memcpy(ptr, cpu_cache_get(kmem_cache), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmem_cache->array[smp_processor_id()] = ptr; @@ -1588,10 +1486,6 @@ void __init kmem_cache_init(void) != &initarray_generic.cache); memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); - /* - * Do not assume that spinlocks can be initialized via memcpy: - */ - spin_lock_init(&ptr->lock); kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } @@ -1628,9 +1522,6 @@ void __init kmem_cache_init_late(void) BUG(); mutex_unlock(&slab_mutex); - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); - /* Done! */ slab_state = FULL; @@ -1690,14 +1581,10 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", cachep->name, cachep->size, cachep->gfporder); - for_each_online_node(node) { + for_each_kmem_cache_node(cachep, node, n) { unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - n = cachep->node[node]; - if (!n) - continue; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->slabs_full, lru) { active_objs += cachep->num; @@ -2060,6 +1947,16 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) +{ + struct page *page, *n; + + list_for_each_entry_safe(page, n, list, lru) { + list_del(&page->lru); + slab_destroy(cachep, page); + } +} + /** * calculate_slab_order - calculate size (page order) of slabs * @cachep: pointer to the cache that is being created @@ -2405,17 +2302,6 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) return err; } - if (flags & SLAB_DEBUG_OBJECTS) { - /* - * Would deadlock through slab_destroy()->call_rcu()-> - * debug_object_activate()->kmem_cache_alloc(). - */ - WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); - - slab_set_debugobj_lock_classes(cachep); - } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) - on_slab_lock_classes(cachep); - return 0; } @@ -2434,7 +2320,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); + assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2442,7 +2328,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->node[node]->list_lock); + assert_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2462,12 +2348,16 @@ static void do_drain(void *arg) struct kmem_cache *cachep = arg; struct array_cache *ac; int node = numa_mem_id(); + struct kmem_cache_node *n; + LIST_HEAD(list); check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->node[node]->list_lock); - free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->node[node]->list_lock); + n = get_node(cachep, node); + spin_lock(&n->list_lock); + free_block(cachep, ac->entry, ac->avail, node, &list); + spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail = 0; } @@ -2478,17 +2368,12 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); - for_each_online_node(node) { - n = cachep->node[node]; - if (n && n->alien) + for_each_kmem_cache_node(cachep, node, n) + if (n->alien) drain_alien_cache(cachep, n->alien); - } - for_each_online_node(node) { - n = cachep->node[node]; - if (n) - drain_array(cachep, n, n->shared, 1, node); - } + for_each_kmem_cache_node(cachep, node, n) + drain_array(cachep, n, n->shared, 1, node); } /* @@ -2534,17 +2419,14 @@ out: int __kmem_cache_shrink(struct kmem_cache *cachep) { - int ret = 0, i = 0; + int ret = 0; + int node; struct kmem_cache_node *n; drain_cpu_caches(cachep); check_irq_on(); - for_each_online_node(i) { - n = cachep->node[i]; - if (!n) - continue; - + for_each_kmem_cache_node(cachep, node, n) { drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || @@ -2566,13 +2448,11 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) kfree(cachep->array[i]); /* NUMA: free the node structures */ - for_each_online_node(i) { - n = cachep->node[i]; - if (n) { - kfree(n->shared); - free_alien_cache(n->alien); - kfree(n); - } + for_each_kmem_cache_node(cachep, i, n) { + kfree(n->shared); + free_alien_cache(n->alien); + kfree(n); + cachep->node[i] = NULL; } return 0; } @@ -2751,7 +2631,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the node list lock to change the colour_next on this node */ check_irq_off(); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); spin_lock(&n->list_lock); /* Get colour for the slab, and cal the next value. */ @@ -2920,7 +2800,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - n = cachep->node[node]; + n = get_node(cachep, node); BUG_ON(ac->avail > 0 || !n); spin_lock(&n->list_lock); @@ -3060,7 +2940,7 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) { - if (cachep == kmem_cache) + if (unlikely(cachep == kmem_cache)) return false; return should_failslab(cachep->object_size, flags, cachep->flags); @@ -3169,8 +3049,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->node[nid] && - cache->node[nid]->free_objects) { + get_node(cache, nid) && + get_node(cache, nid)->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3233,7 +3113,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int x; VM_BUG_ON(nodeid > num_online_nodes()); - n = cachep->node[nodeid]; + n = get_node(cachep, nodeid); BUG_ON(!n); retry: @@ -3304,7 +3184,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->node[nodeid])) { + if (unlikely(!get_node(cachep, nodeid))) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3405,12 +3285,13 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) /* * Caller needs to acquire correct kmem_cache_node's list_lock + * @list: List of detached free slabs should be freed by caller */ -static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, - int node) +static void free_block(struct kmem_cache *cachep, void **objpp, + int nr_objects, int node, struct list_head *list) { int i; - struct kmem_cache_node *n; + struct kmem_cache_node *n = get_node(cachep, node); for (i = 0; i < nr_objects; i++) { void *objp; @@ -3420,7 +3301,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; page = virt_to_head_page(objp); - n = cachep->node[node]; list_del(&page->lru); check_spinlock_acquired_node(cachep, node); slab_put_obj(cachep, page, objp, node); @@ -3431,13 +3311,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, if (page->active == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; - /* No need to drop any previously held - * lock here, even if we have a off-slab slab - * descriptor it is guaranteed to come from - * a different cache, refer to comments before - * alloc_slabmgmt. - */ - slab_destroy(cachep, page); + list_add_tail(&page->lru, list); } else { list_add(&page->lru, &n->slabs_free); } @@ -3456,13 +3330,14 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) int batchcount; struct kmem_cache_node *n; int node = numa_mem_id(); + LIST_HEAD(list); batchcount = ac->batchcount; #if DEBUG BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - n = cachep->node[node]; + n = get_node(cachep, node); spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; @@ -3477,7 +3352,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) } } - free_block(cachep, ac->entry, batchcount, node); + free_block(cachep, ac->entry, batchcount, node, &list); free_done: #if STATS { @@ -3498,6 +3373,7 @@ free_done: } #endif spin_unlock(&n->list_lock); + slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); } @@ -3754,7 +3630,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) int node; struct kmem_cache_node *n; struct array_cache *new_shared; - struct array_cache **new_alien = NULL; + struct alien_cache **new_alien = NULL; for_each_online_node(node) { @@ -3775,15 +3651,16 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) } } - n = cachep->node[node]; + n = get_node(cachep, node); if (n) { struct array_cache *shared = n->shared; + LIST_HEAD(list); spin_lock_irq(&n->list_lock); if (shared) free_block(cachep, shared->entry, - shared->avail, node); + shared->avail, node, &list); n->shared = new_shared; if (!n->alien) { @@ -3793,6 +3670,7 @@ static int alloc_kmem_cache_node(struct kmem_cache *cachep, gfp_t gfp) n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(shared); free_alien_cache(new_alien); continue; @@ -3820,9 +3698,8 @@ fail: /* Cache is not active yet. Roll back what we did */ node--; while (node >= 0) { - if (cachep->node[node]) { - n = cachep->node[node]; - + n = get_node(cachep, node); + if (n) { kfree(n->shared); free_alien_cache(n->alien); kfree(n); @@ -3883,12 +3760,20 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, cachep->shared = shared; for_each_online_cpu(i) { + LIST_HEAD(list); struct array_cache *ccold = new->new[i]; + int node; + struct kmem_cache_node *n; + if (!ccold) continue; - spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); + + node = cpu_to_mem(i); + n = get_node(cachep, node); + spin_lock_irq(&n->list_lock); + free_block(cachep, ccold->entry, ccold->avail, node, &list); + spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); kfree(ccold); } kfree(new); @@ -3996,6 +3881,7 @@ skip_setup: static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, struct array_cache *ac, int force, int node) { + LIST_HEAD(list); int tofree; if (!ac || !ac->avail) @@ -4008,12 +3894,13 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, tofree = force ? ac->avail : (ac->limit + 4) / 5; if (tofree > ac->avail) tofree = (ac->avail + 1) / 2; - free_block(cachep, ac->entry, tofree, node); + free_block(cachep, ac->entry, tofree, node, &list); ac->avail -= tofree; memmove(ac->entry, &(ac->entry[tofree]), sizeof(void *) * ac->avail); } spin_unlock_irq(&n->list_lock); + slabs_destroy(cachep, &list); } } @@ -4048,7 +3935,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - n = searchp->node[node]; + n = get_node(searchp, node); reap_alien(searchp, n); @@ -4100,10 +3987,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); @@ -4328,10 +4212,7 @@ static int leaks_show(struct seq_file *m, void *p) x[1] = 0; - for_each_online_node(node) { - n = cachep->node[node]; - if (!n) - continue; + for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); spin_lock_irq(&n->list_lock); diff --git a/mm/slab.h b/mm/slab.h index 961a3fb1f5a2..928823e17e58 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -260,9 +260,8 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) WARN_ON_ONCE(1); return s; } -#endif - +#ifndef CONFIG_SLOB /* * The slab lists for all objects. */ @@ -277,7 +276,7 @@ struct kmem_cache_node { unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ + struct alien_cache **alien; /* on other nodes */ unsigned long next_reap; /* updated without locking */ int free_touched; /* updated without locking */ #endif @@ -294,5 +293,22 @@ struct kmem_cache_node { }; +static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) +{ + return s->node[node]; +} + +/* + * Iterator over all nodes. The body will be executed for each node that has + * a kmem_cache_node structure allocated (which is true for all online nodes) + */ +#define for_each_kmem_cache_node(__s, __node, __n) \ + for (__node = 0; __n = get_node(__s, __node), __node < nr_node_ids; __node++) \ + if (__n) + +#endif + void *slab_next(struct seq_file *m, void *p, loff_t *pos); void slab_stop(struct seq_file *m, void *p); + +#endif /* MM_SLAB_H */ diff --git a/mm/slab_common.c b/mm/slab_common.c index d31c4bacc6a2..d319502b2403 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -19,6 +19,8 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> + +#define CREATE_TRACE_POINTS #include <trace/events/kmem.h> #include "slab.h" @@ -787,3 +789,102 @@ static int __init slab_proc_init(void) } module_init(slab_proc_init); #endif /* CONFIG_SLABINFO */ + +static __always_inline void *__do_krealloc(const void *p, size_t new_size, + gfp_t flags) +{ + void *ret; + size_t ks = 0; + + if (p) + ks = ksize(p); + + if (ks >= new_size) + return (void *)p; + + ret = kmalloc_track_caller(new_size, flags); + if (ret && p) + memcpy(ret, p, ks); + + return ret; +} + +/** + * __krealloc - like krealloc() but don't free @p. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * This function is like krealloc() except it never frees the originally + * allocated buffer. Use this if you don't want to free the buffer immediately + * like, for example, with RCU. + */ +void *__krealloc(const void *p, size_t new_size, gfp_t flags) +{ + if (unlikely(!new_size)) + return ZERO_SIZE_PTR; + + return __do_krealloc(p, new_size, flags); + +} +EXPORT_SYMBOL(__krealloc); + +/** + * krealloc - reallocate memory. The contents will remain unchanged. + * @p: object to reallocate memory for. + * @new_size: how many bytes of memory are required. + * @flags: the type of memory to allocate. + * + * The contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. If @p is %NULL, krealloc() + * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a + * %NULL pointer, the object pointed to is freed. + */ +void *krealloc(const void *p, size_t new_size, gfp_t flags) +{ + void *ret; + + if (unlikely(!new_size)) { + kfree(p); + return ZERO_SIZE_PTR; + } + + ret = __do_krealloc(p, new_size, flags); + if (ret && p != ret) + kfree(p); + + return ret; +} +EXPORT_SYMBOL(krealloc); + +/** + * kzfree - like kfree but zero memory + * @p: object to free memory of + * + * The memory of the object @p points to is zeroed before freed. + * If @p is %NULL, kzfree() does nothing. + * + * Note: this function zeroes the whole allocated buffer which can be a good + * deal bigger than the requested buffer size passed to kmalloc(). So be + * careful when using this function in performance sensitive code. + */ +void kzfree(const void *p) +{ + size_t ks; + void *mem = (void *)p; + + if (unlikely(ZERO_OR_NULL_PTR(mem))) + return; + ks = ksize(mem); + memset(mem, 0, ks); + kfree(mem); +} +EXPORT_SYMBOL(kzfree); + +/* Tracepoints definitions. */ +EXPORT_TRACEPOINT_SYMBOL(kmalloc); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); +EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); +EXPORT_TRACEPOINT_SYMBOL(kfree); +EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/slub.c b/mm/slub.c index 8c24a23fdafa..2b068c3638aa 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -233,11 +233,6 @@ static inline void stat(const struct kmem_cache *s, enum stat_item si) * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -288,6 +283,10 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -945,60 +944,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, - gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interrupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); -} - -/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache *s, @@ -1282,6 +1227,12 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) { kmemleak_alloc(ptr, size, 1, flags); @@ -1293,21 +1244,44 @@ static inline void kfree_hook(const void *x) } static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->object_size, flags, s->flags); +} -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) +static inline void slab_post_alloc_hook(struct kmem_cache *s, + gfp_t flags, void *object) { - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); } static inline void slab_free_hook(struct kmem_cache *s, void *x) { kmemleak_free_recursive(x, s->flags); -} -#endif /* CONFIG_SLUB_DEBUG */ + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. + */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); +} /* * Slab allocation and freeing @@ -1409,9 +1383,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; void *start; - void *last; void *p; int order; + int idx; BUG_ON(flags & GFP_SLAB_BUG_MASK); @@ -1432,14 +1406,13 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; @@ -2162,6 +2135,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) return; @@ -2176,15 +2150,11 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); @@ -2928,13 +2898,10 @@ static void early_kmem_cache_node_alloc(int node) static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -3224,12 +3191,11 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3414,9 +3380,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { if (!n->nr_partial) continue; @@ -3588,6 +3552,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3597,19 +3562,16 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache) * IPIs around. */ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } list_add(&s->list, &slab_caches); return s; @@ -3962,16 +3924,14 @@ static long validate_slab_cache(struct kmem_cache *s) unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4125,6 +4085,7 @@ static int list_locations(struct kmem_cache *s, char *buf, int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4134,8 +4095,7 @@ static int list_locations(struct kmem_cache *s, char *buf, /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4207,7 +4167,7 @@ static int list_locations(struct kmem_cache *s, char *buf, #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; @@ -4334,8 +4294,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = atomic_long_read(&n->total_objects); @@ -4351,9 +4312,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4366,7 +4327,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); @@ -4380,16 +4341,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -5344,13 +5301,9 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) { nr_slabs += node_nr_slabs(n); nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); diff --git a/mm/swap.c b/mm/swap.c index 9e8e3472248b..6b2dc3897cd5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -62,6 +62,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru_list(page, lruvec, page_off_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } + mem_cgroup_uncharge(page); } static void __put_single_page(struct page *page) @@ -501,7 +502,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); - trace_mm_lru_activate(page, page_to_pfn(page)); + trace_mm_lru_activate(page); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -589,6 +590,9 @@ static void __lru_cache_activate_page(struct page *page) * inactive,unreferenced -> inactive,referenced * inactive,referenced -> active,unreferenced * active,unreferenced -> active,referenced + * + * When a newly allocated page is not yet visible, so safe for non-atomic ops, + * __SetPageReferenced(page) may be substituted for mark_page_accessed(page). */ void mark_page_accessed(struct page *page) { @@ -614,17 +618,6 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Used to mark_page_accessed(page) that is not visible yet and when it is - * still safe to use non-atomic ops - */ -void init_page_accessed(struct page *page) -{ - if (!PageReferenced(page)) - __SetPageReferenced(page); -} -EXPORT_SYMBOL(init_page_accessed); - static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -695,6 +688,40 @@ void add_page_to_unevictable_list(struct page *page) spin_unlock_irq(&zone->lru_lock); } +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * Place @page on the active or unevictable LRU list, depending on its + * evictability. Note that if the page is not evictable, it goes + * directly back onto it's zone's unevictable list, it does NOT use a + * per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + VM_BUG_ON_PAGE(PageLRU(page), page); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { + SetPageActive(page); + lru_cache_add(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); +} + /* * If the page can not be invalidated, it is moved to the * inactive list to speed up its reclaim. It is moved to the @@ -921,6 +948,7 @@ void release_pages(struct page **pages, int nr, bool cold) if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); + mem_cgroup_uncharge_list(&pages_to_free); free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); @@ -996,7 +1024,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, SetPageLRU(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); - trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); + trace_mm_lru_insertion(page, lru); } /* diff --git a/mm/swap_state.c b/mm/swap_state.c index 2972eee184a4..e160151da6b8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -176,7 +176,7 @@ int add_to_swap(struct page *page, struct list_head *list) if (unlikely(PageTransHuge(page))) if (unlikely(split_huge_page_to_list(page, list))) { - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } @@ -202,7 +202,7 @@ int add_to_swap(struct page *page, struct list_head *list) * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); return 0; } } @@ -225,7 +225,7 @@ void delete_from_swap_cache(struct page *page) __delete_from_swap_cache(page); spin_unlock_irq(&address_space->tree_lock); - swapcache_free(entry, page); + swapcache_free(entry); page_cache_release(page); } @@ -386,7 +386,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); } while (err != -ENOMEM); if (new_page) diff --git a/mm/swapfile.c b/mm/swapfile.c index 4c524f7bd0bf..8798b2e0ac59 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -843,16 +843,13 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void swapcache_free(swp_entry_t entry, struct page *page) +void swapcache_free(swp_entry_t entry) { struct swap_info_struct *p; - unsigned char count; p = swap_info_get(entry); if (p) { - count = swap_entry_free(p, entry, SWAP_HAS_CACHE); - if (page) - mem_cgroup_uncharge_swapcache(page, entry, count != 0); + swap_entry_free(p, entry, SWAP_HAS_CACHE); spin_unlock(&p->lock); } } @@ -1106,15 +1103,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, if (unlikely(!page)) return -ENOMEM; - if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, - GFP_KERNEL, &memcg)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) { ret = -ENOMEM; goto out_nolock; } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { - mem_cgroup_cancel_charge_swapin(memcg); + mem_cgroup_cancel_charge(page, memcg); ret = 0; goto out; } @@ -1124,11 +1120,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); - if (page == swapcache) + if (page == swapcache) { page_add_anon_rmap(page, vma, addr); - else /* ksm created a completely new copy */ + mem_cgroup_commit_charge(page, memcg, true); + } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr); - mem_cgroup_commit_charge_swapin(page, memcg); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); + } swap_free(entry); /* * Move the page to the active list so it is not diff --git a/mm/truncate.c b/mm/truncate.c index 6a78c814bebf..96d167372d89 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -281,7 +281,6 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -307,7 +306,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -355,26 +353,30 @@ void truncate_inode_pages_range(struct address_space *mapping, for ( ; ; ) { cond_resched(); if (!pagevec_lookup_entries(&pvec, mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - indices)) { + min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* If all gone from start onwards, we're done */ if (index == start) break; + /* Otherwise restart to make sure all gone */ index = start; continue; } if (index == start && indices[0] >= end) { + /* All gone out of hole to be punched, we're done */ pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); break; } - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; /* We rely upon deletion not changing page->index */ index = indices[i]; - if (index >= end) + if (index >= end) { + /* Restart punch to make sure all gone */ + index = start - 1; break; + } if (radix_tree_exceptional_entry(page)) { clear_exceptional_entry(mapping, index, page); @@ -389,7 +391,6 @@ void truncate_inode_pages_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); index++; } cleancache_invalidate_inode(mapping); @@ -488,7 +489,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -517,7 +517,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } @@ -548,7 +547,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) BUG_ON(page_has_private(page)); __delete_from_page_cache(page, NULL); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (mapping->a_ops->freepage) mapping->a_ops->freepage(page); @@ -597,7 +595,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { - mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -650,7 +647,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); - mem_cgroup_uncharge_end(); cond_resched(); index++; } diff --git a/mm/util.c b/mm/util.c index d5ea733c5082..ff7a52e7386e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -3,6 +3,7 @@ #include <linux/string.h> #include <linux/compiler.h> #include <linux/export.h> +#include <linux/ctype.h> #include <linux/err.h> #include <linux/sched.h> #include <linux/security.h> @@ -16,9 +17,6 @@ #include "internal.h" -#define CREATE_TRACE_POINTS -#include <trace/events/kmem.h> - /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -65,6 +63,35 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) EXPORT_SYMBOL(kstrndup); /** + * kstrimdup - Trim and copy a %NUL terminated string. + * @s: the string to trim and duplicate + * @gfp: the GFP mask used in the kmalloc() call when allocating memory + * + * Returns an address, which the caller must kfree, containing + * a duplicate of the passed string with leading and/or trailing + * whitespace (as defined by isspace) removed. + */ +char *kstrimdup(const char *s, gfp_t gfp) +{ + char *buf; + char *begin = skip_spaces(s); + size_t len = strlen(begin); + + while (len && isspace(begin[len - 1])) + len--; + + buf = kmalloc_track_caller(len + 1, gfp); + if (!buf) + return NULL; + + memcpy(buf, begin, len); + buf[len] = '\0'; + + return buf; +} +EXPORT_SYMBOL(kstrimdup); + +/** * kmemdup - duplicate region of memory * * @src: memory region to duplicate @@ -112,97 +139,6 @@ void *memdup_user(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user); -static __always_inline void *__do_krealloc(const void *p, size_t new_size, - gfp_t flags) -{ - void *ret; - size_t ks = 0; - - if (p) - ks = ksize(p); - - if (ks >= new_size) - return (void *)p; - - ret = kmalloc_track_caller(new_size, flags); - if (ret && p) - memcpy(ret, p, ks); - - return ret; -} - -/** - * __krealloc - like krealloc() but don't free @p. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * This function is like krealloc() except it never frees the originally - * allocated buffer. Use this if you don't want to free the buffer immediately - * like, for example, with RCU. - */ -void *__krealloc(const void *p, size_t new_size, gfp_t flags) -{ - if (unlikely(!new_size)) - return ZERO_SIZE_PTR; - - return __do_krealloc(p, new_size, flags); - -} -EXPORT_SYMBOL(__krealloc); - -/** - * krealloc - reallocate memory. The contents will remain unchanged. - * @p: object to reallocate memory for. - * @new_size: how many bytes of memory are required. - * @flags: the type of memory to allocate. - * - * The contents of the object pointed to are preserved up to the - * lesser of the new and old sizes. If @p is %NULL, krealloc() - * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a - * %NULL pointer, the object pointed to is freed. - */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) -{ - void *ret; - - if (unlikely(!new_size)) { - kfree(p); - return ZERO_SIZE_PTR; - } - - ret = __do_krealloc(p, new_size, flags); - if (ret && p != ret) - kfree(p); - - return ret; -} -EXPORT_SYMBOL(krealloc); - -/** - * kzfree - like kfree but zero memory - * @p: object to free memory of - * - * The memory of the object @p points to is zeroed before freed. - * If @p is %NULL, kzfree() does nothing. - * - * Note: this function zeroes the whole allocated buffer which can be a good - * deal bigger than the requested buffer size passed to kmalloc(). So be - * careful when using this function in performance sensitive code. - */ -void kzfree(const void *p) -{ - size_t ks; - void *mem = (void *)p; - - if (unlikely(ZERO_OR_NULL_PTR(mem))) - return; - ks = ksize(mem); - memset(mem, 0, ks); - kfree(mem); -} -EXPORT_SYMBOL(kzfree); - /* * strndup_user - duplicate an existing string from user space * @s: The string to duplicate @@ -504,11 +440,3 @@ out_mm: out: return res; } - -/* Tracepoints definitions. */ -EXPORT_TRACEPOINT_SYMBOL(kmalloc); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node); -EXPORT_TRACEPOINT_SYMBOL(kfree); -EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f64632b67196..2b0aa5486092 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1270,19 +1270,15 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) } EXPORT_SYMBOL_GPL(unmap_kernel_range); -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) { unsigned long addr = (unsigned long)area->addr; unsigned long end = addr + get_vm_area_size(area); int err; - err = vmap_page_range(addr, end, prot, *pages); - if (err > 0) { - *pages += err; - err = 0; - } + err = vmap_page_range(addr, end, prot, pages); - return err; + return err > 0 ? 0 : err; } EXPORT_SYMBOL_GPL(map_vm_area); @@ -1548,7 +1544,7 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, &pages)) { + if (map_vm_area(area, prot, pages)) { vunmap(area->addr); return NULL; } @@ -1566,7 +1562,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, const int order = 0; struct page **pages; unsigned int nr_pages, array_size, i; - gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1589,12 +1586,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) { struct page *page; - gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; if (node == NUMA_NO_NODE) - page = alloc_page(tmp_mask); + page = alloc_page(alloc_mask); else - page = alloc_pages_node(node, tmp_mask, order); + page = alloc_pages_node(node, alloc_mask, order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ @@ -1602,9 +1598,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } area->pages[i] = page; + if (gfp_mask & __GFP_WAIT) + cond_resched(); } - if (map_vm_area(area, prot, &pages)) + if (map_vm_area(area, prot, pages)) goto fail; return area->addr; @@ -2690,14 +2688,14 @@ void get_vmalloc_info(struct vmalloc_info *vmi) prev_end = VMALLOC_START; - spin_lock(&vmap_area_lock); + rcu_read_lock(); if (list_empty(&vmap_area_list)) { vmi->largest_chunk = VMALLOC_TOTAL; goto out; } - list_for_each_entry(va, &vmap_area_list, list) { + list_for_each_entry_rcu(va, &vmap_area_list, list) { unsigned long addr = va->va_start; /* @@ -2724,7 +2722,7 @@ void get_vmalloc_info(struct vmalloc_info *vmi) vmi->largest_chunk = VMALLOC_END - prev_end; out: - spin_unlock(&vmap_area_lock); + rcu_read_unlock(); } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 0f16ffe8eb67..27432baea971 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -58,36 +58,28 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h> -struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; +/* Scan control flags */ +#define MAY_WRITEPAGE 0x1 +#define MAY_UNMAP 0x2 +#define MAY_SWAP 0x4 +#define MAY_SKIP_CONGESTION 0x8 +#define COMPACTION_READY 0x10 +struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; - - /* anon vs. file LRUs scanning "ratio" */ - int swappiness; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -95,11 +87,17 @@ struct scan_control { */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + /* Scan control flags; see above */ + unsigned int flags; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -136,7 +134,11 @@ struct scan_control { * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -169,7 +171,8 @@ static unsigned long zone_reclaimable_pages(struct zone *zone) bool zone_reclaimable(struct zone *zone) { - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; } static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) @@ -571,9 +574,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); + swapcache_free(swap); } else { void (*freepage)(struct page *); void *shadow = NULL; @@ -594,7 +598,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); if (freepage != NULL) freepage(page); @@ -816,7 +819,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; @@ -840,7 +842,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(!page_evictable(page))) goto cull_mlocked; - if (!sc->may_unmap && page_mapped(page)) + if (!(sc->flags & MAY_UNMAP) && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -1014,7 +1016,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; if (!may_enter_fs) goto keep_locked; - if (!sc->may_writepage) + if (!(sc->flags & MAY_WRITEPAGE)) goto keep_locked; /* Page is dirty, try to write it out here */ @@ -1127,11 +1129,12 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } + mem_cgroup_uncharge_list(&free_pages); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; *ret_nr_congested += nr_congested; *ret_nr_unqueued_dirty += nr_unqueued_dirty; @@ -1146,7 +1149,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, struct scan_control sc = { .gfp_mask = GFP_KERNEL, .priority = DEF_PRIORITY, - .may_unmap = 1, + .flags = MAY_UNMAP, }; unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; @@ -1431,6 +1434,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1489,9 +1493,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, lru_add_drain(); - if (!sc->may_unmap) + if (!(sc->flags & MAY_UNMAP)) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) + if (!(sc->flags & MAY_WRITEPAGE)) isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); @@ -1503,7 +1507,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1538,6 +1542,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&page_list); free_hot_cold_page_list(&page_list, true); /* @@ -1593,7 +1598,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. */ - if (!sc->hibernation_mode && !current_is_kswapd() && + if (!(sc->flags & MAY_SKIP_CONGESTION) && !current_is_kswapd() && current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); @@ -1652,6 +1657,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec, if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1683,9 +1689,9 @@ static void shrink_active_list(unsigned long nr_to_scan, lru_add_drain(); - if (!sc->may_unmap) + if (!(sc->flags & MAY_UNMAP)) isolate_mode |= ISOLATE_UNMAPPED; - if (!sc->may_writepage) + if (!(sc->flags & MAY_WRITEPAGE)) isolate_mode |= ISOLATE_CLEAN; spin_lock_irq(&zone->lru_lock); @@ -1693,7 +1699,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1759,6 +1765,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge_list(&l_hold); free_hot_cold_page_list(&l_hold, true); } @@ -1865,8 +1872,8 @@ enum scan_balance { * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1897,7 +1904,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (get_nr_swap_pages() <= 0)) { + if (!(sc->flags & MAY_SWAP) || (get_nr_swap_pages() <= 0)) { scan_balance = SCAN_FILE; goto out; } @@ -1909,7 +1916,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !sc->swappiness) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1919,7 +1926,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && sc->swappiness) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } @@ -1962,7 +1969,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = sc->swappiness; + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -2052,7 +2059,8 @@ out: /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -2063,7 +2071,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); @@ -2241,9 +2249,10 @@ static inline bool should_continue_reclaim(struct zone *zone, } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc) { unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -2259,11 +2268,12 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) memcg = mem_cgroup_iter(root, NULL, &reclaim); do { struct lruvec *lruvec; + int swappiness; lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); - sc->swappiness = mem_cgroup_swappiness(memcg); - shrink_lruvec(lruvec, sc); + shrink_lruvec(lruvec, swappiness, sc); /* * Direct reclaim and kswapd have to scan all memory @@ -2287,20 +2297,21 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc) sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } /* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until @@ -2309,18 +2320,18 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) */ balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + if (!compaction_suitable(zone, order)) return false; return watermark_ok; @@ -2342,10 +2353,7 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2354,13 +2362,13 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long lru_pages = 0; - bool aborted_reclaim = false; struct reclaim_state *reclaim_state = current->reclaim_state; gfp_t orig_mask; struct shrink_control shrink = { .gfp_mask = sc->gfp_mask, }; enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum @@ -2391,22 +2399,24 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (sc->priority != DEF_PRIORITY && !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if ((zonelist_zone_idx(z) <= requested_highidx) - && compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->flags |= COMPACTION_READY; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2419,10 +2429,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); + if (shrink_zone(zone, sc)) + reclaimable = true; + + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } /* @@ -2445,27 +2462,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) */ sc->gfp_mask = orig_mask; - return aborted_reclaim; -} - -/* All zones in zonelist are unreclaimable? */ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (zone_reclaimable(zone)) - return false; - } - - return true; + return reclaimable; } /* @@ -2489,7 +2486,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, { unsigned long total_scanned = 0; unsigned long writeback_threshold; - bool aborted_reclaim; + bool zones_reclaimable; delayacct_freepages_start(); @@ -2500,18 +2497,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->flags & COMPACTION_READY) + break; /* * If we're getting trouble reclaiming, start doing * writepage even in laptop mode. */ if (sc->priority < DEF_PRIORITY - 2) - sc->may_writepage = 1; + sc->flags |= MAY_WRITEPAGE; /* * Try to write back as many pages as we just scanned. This @@ -2524,30 +2524,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned, WB_REASON_TRY_TO_FREE_PAGES); - sc->may_writepage = 1; + sc->flags |= MAY_WRITEPAGE; } - } while (--sc->priority >= 0 && !aborted_reclaim); + } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->flags & COMPACTION_READY) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2684,17 +2675,17 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, { unsigned long nr_reclaimed; struct scan_control sc = { - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), - .may_writepage = !laptop_mode, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .may_unmap = 1, - .may_swap = 1, + .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, .nodemask = nodemask, + .priority = DEF_PRIORITY, + .flags = MAY_UNMAP | MAY_SWAP, }; + if (!laptop_mode) + sc.flags |= MAY_WRITEPAGE; + /* * Do not enter reclaim if fatal signal was delivered while throttled. * 1 is returned so that the page allocator does not OOM kill at this @@ -2704,7 +2695,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return 1; trace_mm_vmscan_direct_reclaim_begin(order, - sc.may_writepage, + sc.flags & MAY_WRITEPAGE, gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -2722,23 +2713,22 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, - .order = 0, - .priority = 0, - .swappiness = mem_cgroup_swappiness(memcg), + .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .target_mem_cgroup = memcg, + .flags = MAY_UNMAP, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); - sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + if (!laptop_mode) + sc.flags |= MAY_WRITEPAGE; + if (!noswap) + sc.flags |= MAY_SWAP; trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order, - sc.may_writepage, + sc.flags & MAY_WRITEPAGE, sc.gfp_mask); /* @@ -2748,7 +2738,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2764,18 +2754,19 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | - (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .flags = MAY_UNMAP, }; + if (!laptop_mode) + sc.flags |= MAY_WRITEPAGE; + if (!noswap) + sc.flags |= MAY_SWAP; + /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2786,7 +2777,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, zonelist = NODE_DATA(nid)->node_zonelists; trace_mm_vmscan_memcg_reclaim_begin(0, - sc.may_writepage, + sc.flags & MAY_WRITEPAGE, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); @@ -3031,15 +3022,15 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .priority = DEF_PRIORITY, - .may_unmap = 1, - .may_swap = 1, - .may_writepage = !laptop_mode, .order = order, - .target_mem_cgroup = NULL, + .priority = DEF_PRIORITY, + .flags = MAY_UNMAP | MAY_SWAP, }; count_vm_event(PAGEOUTRUN); + if (!laptop_mode) + sc.flags |= MAY_WRITEPAGE; + do { unsigned long lru_pages = 0; unsigned long nr_attempted = 0; @@ -3120,7 +3111,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, * even in laptop mode. */ if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; + sc.flags |= MAY_WRITEPAGE; /* * Now scan the zone in the dma->highmem direction, stopping @@ -3417,14 +3408,11 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, - .may_writepage = 1, .nr_to_reclaim = nr_to_reclaim, - .hibernation_mode = 1, - .order = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, .priority = DEF_PRIORITY, + .flags = MAY_WRITEPAGE | MAY_UNMAP | MAY_SWAP | + MAY_SKIP_CONGESTION, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3604,19 +3592,22 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .flags = MAY_SWAP, }; struct shrink_control shrink = { .gfp_mask = sc.gfp_mask, }; unsigned long nr_slab_pages0, nr_slab_pages1; + if (zone_reclaim_mode & RECLAIM_WRITE) + sc.flags |= MAY_WRITEPAGE; + if (zone_reclaim_mode & RECLAIM_SWAP) + sc.flags |= MAY_UNMAP; + cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP diff --git a/mm/vmstat.c b/mm/vmstat.c index b37bd49bfd55..c1cae63e79fb 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -7,6 +7,7 @@ * zoned VM statistics * Copyright (C) 2006 Silicon Graphics, Inc., * Christoph Lameter <christoph@lameter.com> + * Copyright (C) 2008-2014 Christoph Lameter */ #include <linux/fs.h> #include <linux/mm.h> @@ -14,6 +15,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/cpu.h> +#include <linux/cpumask.h> #include <linux/vmstat.h> #include <linux/sched.h> #include <linux/math64.h> @@ -200,7 +202,7 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, continue; threshold = (*calculate_pressure)(zone); - for_each_possible_cpu(cpu) + for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } @@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) EXPORT_SYMBOL(dec_zone_page_state); #endif -static inline void fold_diff(int *diff) + +/* + * Fold a differential into the global counters. + * Returns the number of counters updated. + */ +static int fold_diff(int *diff) { int i; + int changes = 0; for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - if (diff[i]) + if (diff[i]) { atomic_long_add(diff[i], &vm_stat[i]); + changes++; + } + return changes; } /* @@ -441,12 +452,15 @@ static inline void fold_diff(int *diff) * statistics in the remote zone struct as well as the global cachelines * with the global counters. These could cause remote node cache line * bouncing and will have to be only done when necessary. + * + * The function returns the number of global counters updated. */ -static void refresh_cpu_vm_stats(void) +static int refresh_cpu_vm_stats(void) { struct zone *zone; int i; int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; + int changes = 0; for_each_populated_zone(zone) { struct per_cpu_pageset __percpu *p = zone->pageset; @@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void) continue; } - if (__this_cpu_dec_return(p->expire)) continue; - if (__this_cpu_read(p->pcp.count)) + if (__this_cpu_read(p->pcp.count)) { drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); + changes++; + } #endif } - fold_diff(global_diff); + changes += fold_diff(global_diff); + return changes; } /* @@ -763,6 +779,7 @@ const char * const vmstat_text[] = { "nr_shmem", "nr_dirtied", "nr_written", + "nr_pages_scanned", #ifdef CONFIG_NUMA "numa_hit", @@ -1067,7 +1084,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - zone->pages_scanned, + zone_page_state(zone, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); @@ -1077,10 +1094,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); seq_printf(m, - "\n protection: (%lu", + "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); + seq_printf(m, ", %ld", zone->lowmem_reserve[i]); seq_printf(m, ")" "\n pagesets"); @@ -1228,20 +1245,106 @@ static const struct file_operations proc_vmstat_file_operations = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +cpumask_var_t cpu_stat_off; static void vmstat_update(struct work_struct *w) { - refresh_cpu_vm_stats(); - schedule_delayed_work(this_cpu_ptr(&vmstat_work), + if (refresh_cpu_vm_stats()) + /* + * Counters were updated so we expect more updates + * to occur in the future. Keep on running the + * update worker thread. + */ + schedule_delayed_work(this_cpu_ptr(&vmstat_work), + round_jiffies_relative(sysctl_stat_interval)); + else { + /* + * We did not update any counters so the app may be in + * a mode where it does not cause counter updates. + * We may be uselessly running vmstat_update. + * Defer the checking for differentials to the + * shepherd thread on a different processor. + */ + int r; + /* + * Shepherd work thread does not race since it never + * changes the bit if its zero but the cpu + * online / off line code may race if + * worker threads are still allowed during + * shutdown / startup. + */ + r = cpumask_test_and_set_cpu(smp_processor_id(), + cpu_stat_off); + VM_BUG_ON(r); + } +} + +/* + * Check if the diffs for a certain cpu indicate that + * an update is needed. + */ +static bool need_update(int cpu) +{ + struct zone *zone; + + for_each_populated_zone(zone) { + struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu); + + BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1); + /* + * The fast way of checking if there are any vmstat diffs. + * This works because the diffs are byte sized items. + */ + if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS)) + return true; + + } + return false; +} + + +/* + * Shepherd worker thread that checks the + * differentials of processors that have their worker + * threads for vm statistics updates disabled because of + * inactivity. + */ +static void vmstat_shepherd(struct work_struct *w); + +static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd); + +static void vmstat_shepherd(struct work_struct *w) +{ + int cpu; + + /* Check processors whose vmstat worker threads have been disabled */ + for_each_cpu(cpu, cpu_stat_off) + if (need_update(cpu) && + cpumask_test_and_clear_cpu(cpu, cpu_stat_off)) + + schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu), + __round_jiffies_relative(sysctl_stat_interval, cpu)); + + + schedule_delayed_work(&shepherd, round_jiffies_relative(sysctl_stat_interval)); + } -static void start_cpu_timer(int cpu) +static void __init start_shepherd_timer(void) { - struct delayed_work *work = &per_cpu(vmstat_work, cpu); + int cpu; + + for_each_possible_cpu(cpu) + INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu), + vmstat_update); + + if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL)) + BUG(); + cpumask_copy(cpu_stat_off, cpu_online_mask); - INIT_DEFERRABLE_WORK(work, vmstat_update); - schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); + schedule_delayed_work(&shepherd, + round_jiffies_relative(sysctl_stat_interval)); } static void vmstat_cpu_dead(int node) @@ -1272,17 +1375,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: refresh_zone_stat_thresholds(); - start_cpu_timer(cpu); node_set_state(cpu_to_node(cpu), N_CPU); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); - per_cpu(vmstat_work, cpu).work.func = NULL; + if (!cpumask_test_and_set_cpu(cpu, cpu_stat_off)) + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - start_cpu_timer(cpu); + cpumask_set_cpu(cpu, cpu_stat_off); break; case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1302,15 +1405,10 @@ static struct notifier_block vmstat_notifier = static int __init setup_vmstat(void) { #ifdef CONFIG_SMP - int cpu; - cpu_notifier_register_begin(); __register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) { - start_cpu_timer(cpu); - node_set_state(cpu_to_node(cpu), N_CPU); - } + start_shepherd_timer(); cpu_notifier_register_done(); #endif #ifdef CONFIG_PROC_FS diff --git a/mm/zbud.c b/mm/zbud.c index 01df13a7e2e1..a05790b1915e 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -51,6 +51,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/zbud.h> +#include <linux/zpool.h> /***************** * Structures @@ -113,6 +114,90 @@ struct zbud_header { }; /***************** + * zpool + ****************/ + +#ifdef CONFIG_ZPOOL + +static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle) +{ + return zpool_evict(pool, handle); +} + +static struct zbud_ops zbud_zpool_ops = { + .evict = zbud_zpool_evict +}; + +static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zbud_create_pool(gfp, &zbud_zpool_ops); +} + +static void zbud_zpool_destroy(void *pool) +{ + zbud_destroy_pool(pool); +} + +static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zbud_alloc(pool, size, gfp, handle); +} +static void zbud_zpool_free(void *pool, unsigned long handle) +{ + zbud_free(pool, handle); +} + +static int zbud_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + unsigned int total = 0; + int ret = -EINVAL; + + while (total < pages) { + ret = zbud_reclaim_page(pool, 8); + if (ret < 0) + break; + total++; + } + + if (reclaimed) + *reclaimed = total; + + return ret; +} + +static void *zbud_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + return zbud_map(pool, handle); +} +static void zbud_zpool_unmap(void *pool, unsigned long handle) +{ + zbud_unmap(pool, handle); +} + +static u64 zbud_zpool_total_size(void *pool) +{ + return zbud_get_pool_size(pool) * PAGE_SIZE; +} + +static struct zpool_driver zbud_zpool_driver = { + .type = "zbud", + .owner = THIS_MODULE, + .create = zbud_zpool_create, + .destroy = zbud_zpool_destroy, + .malloc = zbud_zpool_malloc, + .free = zbud_zpool_free, + .shrink = zbud_zpool_shrink, + .map = zbud_zpool_map, + .unmap = zbud_zpool_unmap, + .total_size = zbud_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ + +/***************** * Helpers *****************/ /* Just to make the code easier to read */ @@ -122,7 +207,7 @@ enum buddy { }; /* Converts an allocation size in bytes to size in zbud chunks */ -static int size_to_chunks(int size) +static int size_to_chunks(size_t size) { return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; } @@ -247,7 +332,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. */ -int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -511,11 +596,20 @@ static int __init init_zbud(void) /* Make sure the zbud header will fit in one chunk */ BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); pr_info("loaded\n"); + +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zbud_zpool_driver); +#endif + return 0; } static void __exit exit_zbud(void) { +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zbud_zpool_driver); +#endif + pr_info("unloaded\n"); } diff --git a/mm/zpool.c b/mm/zpool.c new file mode 100644 index 000000000000..e40612a1df00 --- /dev/null +++ b/mm/zpool.c @@ -0,0 +1,364 @@ +/* + * zpool memory storage api + * + * Copyright (C) 2014 Dan Streetman + * + * This is a common frontend for memory storage pool implementations. + * Typically, this is used to store compressed memory. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/list.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/module.h> +#include <linux/zpool.h> + +struct zpool { + char *type; + + struct zpool_driver *driver; + void *pool; + struct zpool_ops *ops; + + struct list_head list; +}; + +static LIST_HEAD(drivers_head); +static DEFINE_SPINLOCK(drivers_lock); + +static LIST_HEAD(pools_head); +static DEFINE_SPINLOCK(pools_lock); + +/** + * zpool_register_driver() - register a zpool implementation. + * @driver: driver to register + */ +void zpool_register_driver(struct zpool_driver *driver) +{ + spin_lock(&drivers_lock); + atomic_set(&driver->refcount, 0); + list_add(&driver->list, &drivers_head); + spin_unlock(&drivers_lock); +} +EXPORT_SYMBOL(zpool_register_driver); + +/** + * zpool_unregister_driver() - unregister a zpool implementation. + * @driver: driver to unregister. + * + * Module usage counting is used to prevent using a driver + * while/after unloading, so if this is called from module + * exit function, this should never fail; if called from + * other than the module exit function, and this returns + * failure, the driver is in use and must remain available. + */ +int zpool_unregister_driver(struct zpool_driver *driver) +{ + int ret = 0, refcount; + + spin_lock(&drivers_lock); + refcount = atomic_read(&driver->refcount); + WARN_ON(refcount < 0); + if (refcount > 0) + ret = -EBUSY; + else + list_del(&driver->list); + spin_unlock(&drivers_lock); + + return ret; +} +EXPORT_SYMBOL(zpool_unregister_driver); + +/** + * zpool_evict() - evict callback from a zpool implementation. + * @pool: pool to evict from. + * @handle: handle to evict. + * + * This can be used by zpool implementations to call the + * user's evict zpool_ops struct evict callback. + */ +int zpool_evict(void *pool, unsigned long handle) +{ + struct zpool *zpool; + + spin_lock(&pools_lock); + list_for_each_entry(zpool, &pools_head, list) { + if (zpool->pool == pool) { + spin_unlock(&pools_lock); + if (!zpool->ops || !zpool->ops->evict) + return -EINVAL; + return zpool->ops->evict(zpool, handle); + } + } + spin_unlock(&pools_lock); + + return -ENOENT; +} +EXPORT_SYMBOL(zpool_evict); + +static struct zpool_driver *zpool_get_driver(char *type) +{ + struct zpool_driver *driver; + + spin_lock(&drivers_lock); + list_for_each_entry(driver, &drivers_head, list) { + if (!strcmp(driver->type, type)) { + bool got = try_module_get(driver->owner); + + if (got) + atomic_inc(&driver->refcount); + spin_unlock(&drivers_lock); + return got ? driver : NULL; + } + } + + spin_unlock(&drivers_lock); + return NULL; +} + +static void zpool_put_driver(struct zpool_driver *driver) +{ + atomic_dec(&driver->refcount); + module_put(driver->owner); +} + +/** + * zpool_create_pool() - Create a new zpool + * @type The type of the zpool to create (e.g. zbud, zsmalloc) + * @gfp The GFP flags to use when allocating the pool. + * @ops The optional ops callback. + * + * This creates a new zpool of the specified type. The gfp flags will be + * used when allocating memory, if the implementation supports it. If the + * ops param is NULL, then the created zpool will not be shrinkable. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: New zpool on success, NULL on failure. + */ +struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) +{ + struct zpool_driver *driver; + struct zpool *zpool; + + pr_info("creating pool type %s\n", type); + + driver = zpool_get_driver(type); + + if (!driver) { + request_module(type); + driver = zpool_get_driver(type); + } + + if (!driver) { + pr_err("no driver for type %s\n", type); + return NULL; + } + + zpool = kmalloc(sizeof(*zpool), gfp); + if (!zpool) { + pr_err("couldn't create zpool - out of memory\n"); + zpool_put_driver(driver); + return NULL; + } + + zpool->type = driver->type; + zpool->driver = driver; + zpool->pool = driver->create(gfp, ops); + zpool->ops = ops; + + if (!zpool->pool) { + pr_err("couldn't create %s pool\n", type); + zpool_put_driver(driver); + kfree(zpool); + return NULL; + } + + pr_info("created %s pool\n", type); + + spin_lock(&pools_lock); + list_add(&zpool->list, &pools_head); + spin_unlock(&pools_lock); + + return zpool; +} + +/** + * zpool_destroy_pool() - Destroy a zpool + * @pool The zpool to destroy. + * + * Implementations must guarantee this to be thread-safe, + * however only when destroying different pools. The same + * pool should only be destroyed once, and should not be used + * after it is destroyed. + * + * This destroys an existing zpool. The zpool should not be in use. + */ +void zpool_destroy_pool(struct zpool *zpool) +{ + pr_info("destroying pool type %s\n", zpool->type); + + spin_lock(&pools_lock); + list_del(&zpool->list); + spin_unlock(&pools_lock); + zpool->driver->destroy(zpool->pool); + zpool_put_driver(zpool->driver); + kfree(zpool); +} + +/** + * zpool_get_type() - Get the type of the zpool + * @pool The zpool to check + * + * This returns the type of the pool. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: The type of zpool. + */ +char *zpool_get_type(struct zpool *zpool) +{ + return zpool->type; +} + +/** + * zpool_malloc() - Allocate memory + * @pool The zpool to allocate from. + * @size The amount of memory to allocate. + * @gfp The GFP flags to use when allocating memory. + * @handle Pointer to the handle to set + * + * This allocates the requested amount of memory from the pool. + * The gfp flags will be used when allocating memory, if the + * implementation supports it. The provided @handle will be + * set to the allocated object handle. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error. + */ +int zpool_malloc(struct zpool *zpool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + return zpool->driver->malloc(zpool->pool, size, gfp, handle); +} + +/** + * zpool_free() - Free previously allocated memory + * @pool The zpool that allocated the memory. + * @handle The handle to the memory to free. + * + * This frees previously allocated memory. This does not guarantee + * that the pool will actually free memory, only that the memory + * in the pool will become available for use by the pool. + * + * Implementations must guarantee this to be thread-safe, + * however only when freeing different handles. The same + * handle should only be freed once, and should not be used + * after freeing. + */ +void zpool_free(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->free(zpool->pool, handle); +} + +/** + * zpool_shrink() - Shrink the pool size + * @pool The zpool to shrink. + * @pages The number of pages to shrink the pool. + * @reclaimed The number of pages successfully evicted. + * + * This attempts to shrink the actual memory size of the pool + * by evicting currently used handle(s). If the pool was + * created with no zpool_ops, or the evict call fails for any + * of the handles, this will fail. If non-NULL, the @reclaimed + * parameter will be set to the number of pages reclaimed, + * which may be more than the number of pages requested. + * + * Implementations must guarantee this to be thread-safe. + * + * Returns: 0 on success, negative value on error/failure. + */ +int zpool_shrink(struct zpool *zpool, unsigned int pages, + unsigned int *reclaimed) +{ + return zpool->driver->shrink(zpool->pool, pages, reclaimed); +} + +/** + * zpool_map_handle() - Map a previously allocated handle into memory + * @pool The zpool that the handle was allocated from + * @handle The handle to map + * @mm How the memory should be mapped + * + * This maps a previously allocated handle into memory. The @mm + * param indicates to the implementation how the memory will be + * used, i.e. read-only, write-only, read-write. If the + * implementation does not support it, the memory will be treated + * as read-write. + * + * This may hold locks, disable interrupts, and/or preemption, + * and the zpool_unmap_handle() must be called to undo those + * actions. The code that uses the mapped handle should complete + * its operatons on the mapped handle memory quickly and unmap + * as soon as possible. As the implementation may use per-cpu + * data, multiple handles should not be mapped concurrently on + * any cpu. + * + * Returns: A pointer to the handle's mapped memory area. + */ +void *zpool_map_handle(struct zpool *zpool, unsigned long handle, + enum zpool_mapmode mapmode) +{ + return zpool->driver->map(zpool->pool, handle, mapmode); +} + +/** + * zpool_unmap_handle() - Unmap a previously mapped handle + * @pool The zpool that the handle was allocated from + * @handle The handle to unmap + * + * This unmaps a previously mapped handle. Any locks or other + * actions that the implementation took in zpool_map_handle() + * will be undone here. The memory area returned from + * zpool_map_handle() should no longer be used after this. + */ +void zpool_unmap_handle(struct zpool *zpool, unsigned long handle) +{ + zpool->driver->unmap(zpool->pool, handle); +} + +/** + * zpool_get_total_size() - The total size of the pool + * @pool The zpool to check + * + * This returns the total size in bytes of the pool. + * + * Returns: Total size of the zpool in bytes. + */ +u64 zpool_get_total_size(struct zpool *zpool) +{ + return zpool->driver->total_size(zpool->pool); +} + +static int __init init_zpool(void) +{ + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zpool(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zpool); +module_exit(exit_zpool); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dan Streetman <ddstreet@ieee.org>"); +MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index fe78189624cf..4e2fc83cb394 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -92,6 +92,7 @@ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/zsmalloc.h> +#include <linux/zpool.h> /* * This must be power of 2 and greater than of equal to sizeof(link_free). @@ -240,6 +241,81 @@ struct mapping_area { enum zs_mapmode vm_mm; /* mapping mode */ }; +/* zpool driver */ + +#ifdef CONFIG_ZPOOL + +static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) +{ + return zs_create_pool(gfp); +} + +static void zs_zpool_destroy(void *pool) +{ + zs_destroy_pool(pool); +} + +static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, + unsigned long *handle) +{ + *handle = zs_malloc(pool, size); + return *handle ? 0 : -1; +} +static void zs_zpool_free(void *pool, unsigned long handle) +{ + zs_free(pool, handle); +} + +static int zs_zpool_shrink(void *pool, unsigned int pages, + unsigned int *reclaimed) +{ + return -EINVAL; +} + +static void *zs_zpool_map(void *pool, unsigned long handle, + enum zpool_mapmode mm) +{ + enum zs_mapmode zs_mm; + + switch (mm) { + case ZPOOL_MM_RO: + zs_mm = ZS_MM_RO; + break; + case ZPOOL_MM_WO: + zs_mm = ZS_MM_WO; + break; + case ZPOOL_MM_RW: /* fallthru */ + default: + zs_mm = ZS_MM_RW; + break; + } + + return zs_map_object(pool, handle, zs_mm); +} +static void zs_zpool_unmap(void *pool, unsigned long handle) +{ + zs_unmap_object(pool, handle); +} + +static u64 zs_zpool_total_size(void *pool) +{ + return zs_get_total_size_bytes(pool); +} + +static struct zpool_driver zs_zpool_driver = { + .type = "zsmalloc", + .owner = THIS_MODULE, + .create = zs_zpool_create, + .destroy = zs_zpool_destroy, + .malloc = zs_zpool_malloc, + .free = zs_zpool_free, + .shrink = zs_zpool_shrink, + .map = zs_zpool_map, + .unmap = zs_zpool_unmap, + .total_size = zs_zpool_total_size, +}; + +#endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area); @@ -690,7 +766,7 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, &pages)); + BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); area->vm_addr = area->vm->addr; return area->vm_addr + off; } @@ -814,6 +890,10 @@ static void zs_exit(void) { int cpu; +#ifdef CONFIG_ZPOOL + zpool_unregister_driver(&zs_zpool_driver); +#endif + cpu_notifier_register_begin(); for_each_online_cpu(cpu) @@ -840,6 +920,10 @@ static int zs_init(void) cpu_notifier_register_done(); +#ifdef CONFIG_ZPOOL + zpool_register_driver(&zs_zpool_driver); +#endif + return 0; fail: zs_exit(); diff --git a/mm/zswap.c b/mm/zswap.c index 008388fe7b0f..ea064c1a09ba 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -34,7 +34,7 @@ #include <linux/swap.h> #include <linux/crypto.h> #include <linux/mempool.h> -#include <linux/zbud.h> +#include <linux/zpool.h> #include <linux/mm_types.h> #include <linux/page-flags.h> @@ -45,8 +45,8 @@ /********************************* * statistics **********************************/ -/* Number of memory pages used by the compressed pool */ -static u64 zswap_pool_pages; +/* Total bytes used by the compressed storage */ +static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); @@ -89,8 +89,13 @@ static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); -/* zbud_pool is shared by all of zswap backend */ -static struct zbud_pool *zswap_pool; +/* Compressed storage to use */ +#define ZSWAP_ZPOOL_DEFAULT "zbud" +static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; +module_param_named(zpool, zswap_zpool_type, charp, 0444); + +/* zpool is shared by all of zswap backend */ +static struct zpool *zswap_pool; /********************************* * compression functions @@ -168,7 +173,7 @@ static void zswap_comp_exit(void) * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * offset - the swap offset for the entry. Index into the red-black tree. - * handle - zbud allocation handle that stores the compressed page data + * handle - zpool allocation handle that stores the compressed page data * length - the length in bytes of the compressed page data. Needed during * decompression */ @@ -207,7 +212,7 @@ static int zswap_entry_cache_create(void) return zswap_entry_cache == NULL; } -static void zswap_entry_cache_destory(void) +static void __init zswap_entry_cache_destroy(void) { kmem_cache_destroy(zswap_entry_cache); } @@ -284,15 +289,15 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) } /* - * Carries out the common pattern of freeing and entry's zbud allocation, + * Carries out the common pattern of freeing and entry's zpool allocation, * freeing the entry itself, and decrementing the number of stored pages. */ static void zswap_free_entry(struct zswap_entry *entry) { - zbud_free(zswap_pool, entry->handle); + zpool_free(zswap_pool, entry->handle); zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); } /* caller must hold the tree lock */ @@ -409,7 +414,7 @@ cleanup: static bool zswap_is_full(void) { return totalram_pages * zswap_max_pool_percent / 100 < - zswap_pool_pages; + DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE); } /********************************* @@ -502,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * add_to_swap_cache() doesn't return -EEXIST, so we can safely * clear SWAP_HAS_CACHE flag. */ - swapcache_free(entry, NULL); + swapcache_free(entry); } while (err != -ENOMEM); if (new_page) @@ -525,7 +530,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, * the swap cache, the compressed version stored by zswap can be * freed. */ -static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) { struct zswap_header *zhdr; swp_entry_t swpentry; @@ -541,9 +546,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) }; /* extract swpentry from data */ - zhdr = zbud_map(pool, handle); + zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO); swpentry = zhdr->swpentry; /* here */ - zbud_unmap(pool, handle); + zpool_unmap_handle(pool, handle); tree = zswap_trees[swp_type(swpentry)]; offset = swp_offset(swpentry); @@ -573,13 +578,13 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) case ZSWAP_SWAPCACHE_NEW: /* page is locked */ /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); BUG_ON(dlen != PAGE_SIZE); @@ -652,7 +657,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* reclaim space if needed */ if (zswap_is_full()) { zswap_pool_limit_hit++; - if (zbud_reclaim_page(zswap_pool, 8)) { + if (zpool_shrink(zswap_pool, 1, NULL)) { zswap_reject_reclaim_fail++; ret = -ENOMEM; goto reject; @@ -679,7 +684,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* store */ len = dlen + sizeof(struct zswap_header); - ret = zbud_alloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, + ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN, &handle); if (ret == -ENOSPC) { zswap_reject_compress_poor++; @@ -689,11 +694,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto freepage; } - zhdr = zbud_map(zswap_pool, handle); + zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW); zhdr->swpentry = swp_entry(type, offset); buf = (u8 *)(zhdr + 1); memcpy(buf, dst, dlen); - zbud_unmap(zswap_pool, handle); + zpool_unmap_handle(zswap_pool, handle); put_cpu_var(zswap_dstmem); /* populate entry */ @@ -716,7 +721,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, /* update stats */ atomic_inc(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(zswap_pool); + zswap_pool_total_size = zpool_get_total_size(zswap_pool); return 0; @@ -752,13 +757,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zbud_map(zswap_pool, entry->handle) + - sizeof(struct zswap_header); + src = (u8 *)zpool_map_handle(zswap_pool, entry->handle, + ZPOOL_MM_RO) + sizeof(struct zswap_header); dst = kmap_atomic(page); ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, dst, &dlen); kunmap_atomic(dst); - zbud_unmap(zswap_pool, entry->handle); + zpool_unmap_handle(zswap_pool, entry->handle); BUG_ON(ret); spin_lock(&tree->lock); @@ -811,7 +816,7 @@ static void zswap_frontswap_invalidate_area(unsigned type) zswap_trees[type] = NULL; } -static struct zbud_ops zswap_zbud_ops = { +static struct zpool_ops zswap_zpool_ops = { .evict = zswap_writeback_entry }; @@ -869,8 +874,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_u64("duplicate_entry", S_IRUGO, zswap_debugfs_root, &zswap_duplicate_entry); - debugfs_create_u64("pool_pages", S_IRUGO, - zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_u64("pool_total_size", S_IRUGO, + zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); @@ -895,16 +900,26 @@ static void __exit zswap_debugfs_exit(void) { } **********************************/ static int __init init_zswap(void) { + gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + if (!zswap_enabled) return 0; pr_info("loading zswap\n"); - zswap_pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); + if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { + pr_info("%s zpool not available\n", zswap_zpool_type); + zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; + zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, + &zswap_zpool_ops); + } if (!zswap_pool) { - pr_err("zbud pool creation failed\n"); + pr_err("%s zpool not available\n", zswap_zpool_type); + pr_err("zpool creation failed\n"); goto error; } + pr_info("using %s pool\n", zswap_zpool_type); if (zswap_entry_cache_create()) { pr_err("entry cache creation failed\n"); @@ -926,9 +941,9 @@ static int __init init_zswap(void) pcpufail: zswap_comp_exit(); compfail: - zswap_entry_cache_destory(); + zswap_entry_cache_destroy(); cachefail: - zbud_destroy_pool(zswap_pool); + zpool_destroy_pool(zswap_pool); error: return -ENOMEM; } diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index f14e54a05691..16dd40910c65 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -184,7 +184,7 @@ static bool batadv_frag_insert_packet(struct batadv_orig_node *orig_node, /* Reached the end of the list, so insert after 'frag_entry_curr'. */ if (likely(frag_entry_curr)) { - hlist_add_after(&frag_entry_curr->list, &frag_entry_new->list); + hlist_add_behind(&frag_entry_new->list, &frag_entry_curr->list); chain->size += skb->len - hdr_size; chain->timestamp = jiffies; ret = true; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index b4845f4b2bb4..7751c92c8c57 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1174,7 +1174,7 @@ static void br_multicast_add_router(struct net_bridge *br, } if (slot) - hlist_add_after_rcu(slot, &port->rlist); + hlist_add_behind_rcu(&port->rlist, slot); else hlist_add_head_rcu(&port->rlist, &br->router_list); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 5afeb5aa4c7c..e9cb2588e416 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -940,7 +940,7 @@ static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new) last = li; } if (last) - hlist_add_after_rcu(&last->hlist, &new->hlist); + hlist_add_behind_rcu(&new->hlist, &last->hlist); else hlist_add_before_rcu(&new->hlist, &li->hlist); } diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index 731e1e1722d9..fd0dc47f471d 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -277,7 +277,7 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) last = p; } if (last) - hlist_add_after_rcu(&last->list, &newp->list); + hlist_add_behind_rcu(&newp->list, &last->list); else hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); out: diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0525d78ba328..beeed602aeb3 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -389,7 +389,7 @@ redo: if (h != h0) continue; hlist_del(&pol->bydst); - hlist_add_after(entry0, &pol->bydst); + hlist_add_behind(&pol->bydst, entry0); } entry0 = &pol->bydst; } @@ -654,7 +654,7 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) break; } if (newpos) - hlist_add_after(newpos, &policy->bydst); + hlist_add_behind(&policy->bydst, newpos); else hlist_add_head(&policy->bydst, chain); xfrm_pol_hold(policy); diff --git a/scripts/.gitignore b/scripts/.gitignore index fb070fa1038f..5ecfe93f2028 100644 --- a/scripts/.gitignore +++ b/scripts/.gitignore @@ -4,7 +4,6 @@ conmakehash kallsyms pnmtologo -bin2c unifdef ihex2fw recordmcount diff --git a/scripts/Makefile b/scripts/Makefile index 890df5c6adfb..72902b5f2721 100644 --- a/scripts/Makefile +++ b/scripts/Makefile @@ -13,7 +13,6 @@ HOST_EXTRACFLAGS += -I$(srctree)/tools/include hostprogs-$(CONFIG_KALLSYMS) += kallsyms hostprogs-$(CONFIG_LOGO) += pnmtologo hostprogs-$(CONFIG_VT) += conmakehash -hostprogs-$(CONFIG_IKCONFIG) += bin2c hostprogs-$(BUILD_C_RECORDMCOUNT) += recordmcount hostprogs-$(CONFIG_BUILDTIME_EXTABLE_SORT) += sortextable hostprogs-$(CONFIG_ASN1) += asn1_compiler diff --git a/scripts/basic/.gitignore b/scripts/basic/.gitignore index a776371a3502..9528ec9e5adc 100644 --- a/scripts/basic/.gitignore +++ b/scripts/basic/.gitignore @@ -1 +1,2 @@ fixdep +bin2c diff --git a/scripts/basic/Makefile b/scripts/basic/Makefile index 4fcef87bb875..ec10d9345bc2 100644 --- a/scripts/basic/Makefile +++ b/scripts/basic/Makefile @@ -9,6 +9,7 @@ # fixdep: Used to generate dependency information during build process hostprogs-y := fixdep +hostprogs-$(CONFIG_BUILD_BIN2C) += bin2c always := $(hostprogs-y) # fixdep is needed to compile other host programs diff --git a/scripts/bin2c.c b/scripts/basic/bin2c.c index 96dd2bcbb407..af187e695345 100644 --- a/scripts/bin2c.c +++ b/scripts/basic/bin2c.c @@ -11,7 +11,7 @@ int main(int argc, char *argv[]) { - int ch, total=0; + int ch, total = 0; if (argc > 1) printf("const char %s[] %s=\n", @@ -19,10 +19,9 @@ int main(int argc, char *argv[]) do { printf("\t\""); - while ((ch = getchar()) != EOF) - { + while ((ch = getchar()) != EOF) { total++; - printf("\\x%02x",ch); + printf("\\x%02x", ch); if (total % 16 == 0) break; } diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 182be0f12407..dc72a9b3172d 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -435,7 +435,7 @@ sub build_types { }x; $Type = qr{ $NonptrType - (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*|\[\])+|(?:\s*\[\s*\])+)? + (?:(?:\s|\*|\[\])+\s*const|(?:\s|\*\s*(?:const\s*)?|\[\])+|(?:\s*\[\s*\])+)? (?:\s+$Inline|\s+$Modifier)* }x; $Declare = qr{(?:$Storage\s+(?:$Inline\s+)?)?$Type}; @@ -550,11 +550,43 @@ sub seed_camelcase_includes { } } +sub git_commit_info { + my ($commit, $id, $desc) = @_; + + return ($id, $desc) if ((which("git") eq "") || !(-e ".git")); + + my $output = `git log --no-color --format='%H %s' -1 $commit 2>&1`; + $output =~ s/^\s*//gm; + my @lines = split("\n", $output); + + if ($lines[0] =~ /^error: short SHA1 $commit is ambiguous\./) { +# Maybe one day convert this block of bash into something that returns +# all matching commit ids, but it's very slow... +# +# echo "checking commits $1..." +# git rev-list --remotes | grep -i "^$1" | +# while read line ; do +# git log --format='%H %s' -1 $line | +# echo "commit $(cut -c 1-12,41-)" +# done + } elsif ($lines[0] =~ /^fatal: ambiguous argument '$commit': unknown revision or path not in the working tree\./) { + } else { + $id = substr($lines[0], 0, 12); + $desc = substr($lines[0], 41); + } + + return ($id, $desc); +} + $chk_signoff = 0 if ($file); my @rawlines = (); my @lines = (); my @fixed = (); +my @fixed_inserted = (); +my @fixed_deleted = (); +my $fixlinenr = -1; + my $vname; for my $filename (@ARGV) { my $FILE; @@ -583,6 +615,9 @@ for my $filename (@ARGV) { @rawlines = (); @lines = (); @fixed = (); + @fixed_inserted = (); + @fixed_deleted = (); + $fixlinenr = -1; } exit($exit); @@ -674,6 +709,18 @@ sub format_email { return $formatted_email; } +sub which { + my ($bin) = @_; + + foreach my $path (split(/:/, $ENV{PATH})) { + if (-e "$path/$bin") { + return "$path/$bin"; + } + } + + return ""; +} + sub which_conf { my ($conf) = @_; @@ -1483,6 +1530,90 @@ sub report_dump { our @report; } +sub fixup_current_range { + my ($lineRef, $offset, $length) = @_; + + if ($$lineRef =~ /^\@\@ -\d+,\d+ \+(\d+),(\d+) \@\@/) { + my $o = $1; + my $l = $2; + my $no = $o + $offset; + my $nl = $l + $length; + $$lineRef =~ s/\+$o,$l \@\@/\+$no,$nl \@\@/; + } +} + +sub fix_inserted_deleted_lines { + my ($linesRef, $insertedRef, $deletedRef) = @_; + + my $range_last_linenr = 0; + my $delta_offset = 0; + + my $old_linenr = 0; + my $new_linenr = 0; + + my $next_insert = 0; + my $next_delete = 0; + + my @lines = (); + + my $inserted = @{$insertedRef}[$next_insert++]; + my $deleted = @{$deletedRef}[$next_delete++]; + + foreach my $old_line (@{$linesRef}) { + my $save_line = 1; + my $line = $old_line; #don't modify the array + if ($line =~ /^(?:\+\+\+\|\-\-\-)\s+\S+/) { #new filename + $delta_offset = 0; + } elsif ($line =~ /^\@\@ -\d+,\d+ \+\d+,\d+ \@\@/) { #new hunk + $range_last_linenr = $new_linenr; + fixup_current_range(\$line, $delta_offset, 0); + } + + while (defined($deleted) && ${$deleted}{'LINENR'} == $old_linenr) { + $deleted = @{$deletedRef}[$next_delete++]; + $save_line = 0; + fixup_current_range(\$lines[$range_last_linenr], $delta_offset--, -1); + } + + while (defined($inserted) && ${$inserted}{'LINENR'} == $old_linenr) { + push(@lines, ${$inserted}{'LINE'}); + $inserted = @{$insertedRef}[$next_insert++]; + $new_linenr++; + fixup_current_range(\$lines[$range_last_linenr], $delta_offset++, 1); + } + + if ($save_line) { + push(@lines, $line); + $new_linenr++; + } + + $old_linenr++; + } + + return @lines; +} + +sub fix_insert_line { + my ($linenr, $line) = @_; + + my $inserted = { + LINENR => $linenr, + LINE => $line, + }; + push(@fixed_inserted, $inserted); +} + +sub fix_delete_line { + my ($linenr, $line) = @_; + + my $deleted = { + LINENR => $linenr, + LINE => $line, + }; + + push(@fixed_deleted, $deleted); +} + sub ERROR { my ($type, $msg) = @_; @@ -1637,11 +1768,13 @@ sub process { my $signoff = 0; my $is_patch = 0; - my $in_header_lines = 1; + my $in_header_lines = $file ? 0 : 1; my $in_commit_log = 0; #Scanning lines before patch - + my $reported_maintainer_file = 0; my $non_utf8_charset = 0; + my $last_blank_line = 0; + our @report = (); our $cnt_lines = 0; our $cnt_error = 0; @@ -1759,8 +1892,10 @@ sub process { $realcnt = 0; $linenr = 0; + $fixlinenr = -1; foreach my $line (@lines) { $linenr++; + $fixlinenr++; my $sline = $line; #copy of $line $sline =~ s/$;/ /g; #with comments as spaces @@ -1891,7 +2026,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Do not use whitespace before $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -1899,7 +2034,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "'$ucfirst_sign_off' is the preferred signature form\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } @@ -1908,7 +2043,7 @@ sub process { if (WARN("BAD_SIGN_OFF", "Use a single space after $ucfirst_sign_off\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] = + $fixed[$fixlinenr] = "$ucfirst_sign_off $email"; } } @@ -1956,6 +2091,31 @@ sub process { "Remove Gerrit Change-Id's before submitting upstream.\n" . $herecurr); } +# Check for improperly formed commit descriptions + if ($in_commit_log && + $line =~ /\bcommit\s+[0-9a-f]{5,}/i && + $line !~ /\b[Cc]ommit [0-9a-f]{12,16} \("/) { + $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i; + my $init_char = $1; + my $orig_commit = lc($2); + my $id = '01234567890ab'; + my $desc = 'commit description'; + ($id, $desc) = git_commit_info($orig_commit, $id, $desc); + ERROR("GIT_COMMIT_ID", + "Please use 12 to 16 chars for the git commit ID like: '${init_char}ommit $id (\"$desc\")'\n" . $herecurr); + } + +# Check for added, moved or deleted files + if (!$reported_maintainer_file && !$in_commit_log && + ($line =~ /^(?:new|deleted) file mode\s*\d+\s*$/ || + $line =~ /^rename (?:from|to) [\w\/\.\-]+\s*$/ || + ($line =~ /\{\s*([\w\/\.\-]*)\s*\=\>\s*([\w\/\.\-]*)\s*\}/ && + (defined($1) || defined($2))))) { + $reported_maintainer_file = 1; + WARN("FILE_PATH_CHANGES", + "added, moved or deleted file(s), does MAINTAINERS need updating?\n" . $herecurr); + } + # Check for wrappage within a valid hunk of the file if ($realcnt != 0 && $line !~ m{^(?:\+|-| |\\ No newline|$)}) { ERROR("CORRUPTED_PATCH", @@ -1993,7 +2153,8 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { + !($rawline =~ /^\s+\S/ || + $rawline =~ /^(commit\b|from\b|[\w-]+:).*$/i)) { $in_header_lines = 0; $in_commit_log = 1; } @@ -2021,14 +2182,14 @@ sub process { if (ERROR("DOS_LINE_ENDINGS", "DOS line endings\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/[\s\015]+$//; + $fixed[$fixlinenr] =~ s/[\s\015]+$//; } } elsif ($rawline =~ /^\+.*\S\s+$/ || $rawline =~ /^\+\s+$/) { my $herevet = "$here\n" . cat_vet($rawline) . "\n"; if (ERROR("TRAILING_WHITESPACE", "trailing whitespace\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } $rpt_cleaners = 1; @@ -2049,7 +2210,7 @@ sub process { # Only applies when adding the entry originally, after that we do not have # sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /.\s*config\s+/) { + $line =~ /^\+\s*config\s+/) { my $length = 0; my $cnt = $realcnt; my $ln = $linenr + 1; @@ -2062,10 +2223,11 @@ sub process { $is_end = $lines[$ln - 1] =~ /^\+/; next if ($f =~ /^-/); + last if (!$file && $f =~ /^\@\@/); - if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) { + if ($lines[$ln - 1] =~ /^\+\s*(?:bool|tristate)\s*\"/) { $is_start = 1; - } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) { + } elsif ($lines[$ln - 1] =~ /^\+\s*(?:---)?help(?:---)?$/) { $length = -1; } @@ -2166,7 +2328,7 @@ sub process { if (WARN("QUOTED_WHITESPACE_BEFORE_NEWLINE", "unnecessary whitespace before a quoted newline\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; + $fixed[$fixlinenr] =~ s/^(\+.*\".*)\s+\\n/$1\\n/; } } @@ -2203,7 +2365,7 @@ sub process { if (ERROR("CODE_INDENT", "code indent should use tabs where possible\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } @@ -2213,9 +2375,9 @@ sub process { if (WARN("SPACE_BEFORE_TAB", "please, no space before tabs\n" . $herevet) && $fix) { - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) {8,8}+\t/$1\t\t/) {} - while ($fixed[$linenr - 1] =~ + while ($fixed[$fixlinenr] =~ s/(^\+.*) +\t/$1\t/) {} } } @@ -2249,19 +2411,19 @@ sub process { if (CHK("PARENTHESIS_ALIGNMENT", "Alignment should match open parenthesis\n" . $hereprev) && $fix && $line =~ /^\+/) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^\+[ \t]*/\+$goodtabindent/; } } } } - if ($line =~ /^\+.*\*[ \t]*\)[ \t]+(?!$Assignment|$Arithmetic)/) { + if ($line =~ /^\+.*\(\s*$Type\s*\)[ \t]+(?!$Assignment|$Arithmetic)/) { if (CHK("SPACING", - "No space is necessary after a cast\n" . $hereprev) && + "No space is necessary after a cast\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ - s/^(\+.*\*[ \t]*\))[ \t]+/$1/; + $fixed[$fixlinenr] =~ + s/(\(\s*$Type\s*\))[ \t]+/$1/; } } @@ -2291,10 +2453,44 @@ sub process { "networking block comments put the trailing */ on a separate line\n" . $herecurr); } +# check for missing blank lines after struct/union declarations +# with exceptions for various attributes and macros + if ($prevline =~ /^[\+ ]};?\s*$/ && + $line =~ /^\+/ && + !($line =~ /^\+\s*$/ || + $line =~ /^\+\s*EXPORT_SYMBOL/ || + $line =~ /^\+\s*MODULE_/i || + $line =~ /^\+\s*\#\s*(?:end|elif|else)/ || + $line =~ /^\+[a-z_]*init/ || + $line =~ /^\+\s*(?:static\s+)?[A-Z_]*ATTR/ || + $line =~ /^\+\s*DECLARE/ || + $line =~ /^\+\s*__setup/)) { + if (CHK("LINE_SPACING", + "Please use a blank line after function/struct/union/enum declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } + } + +# check for multiple consecutive blank lines + if ($prevline =~ /^[\+ ]\s*$/ && + $line =~ /^\+\s*$/ && + $last_blank_line != ($linenr - 1)) { + if (CHK("LINE_SPACING", + "Please don't use multiple blank lines\n" . $hereprev) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + } + + $last_blank_line = $linenr; + } + # check for missing blank lines after declarations if ($sline =~ /^\+\s+\S/ && #Not at char 1 # actual declarations ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # function pointer declarations + $prevline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros @@ -2307,6 +2503,8 @@ sub process { $prevline =~ /(?:\{\s*|\\)$/) && # looks like a declaration !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # function pointer declarations + $sline =~ /^\+\s+$Declare\s*\(\s*\*\s*$Ident\s*\)\s*[=,;:\[\(]/ || # foo bar; where foo is some local typedef or #define $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || # known declaration macros @@ -2321,8 +2519,11 @@ sub process { $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && # indentation of previous and current line are the same (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { - WARN("SPACING", - "Missing a blank line after declarations\n" . $hereprev); + if (WARN("LINE_SPACING", + "Missing a blank line after declarations\n" . $hereprev) && + $fix) { + fix_insert_line($fixlinenr, "\+"); + } } # check for spaces at the beginning of a line. @@ -2335,13 +2536,33 @@ sub process { if (WARN("LEADING_SPACE", "please, no spaces at the start of a line\n" . $herevet) && $fix) { - $fixed[$linenr - 1] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; + $fixed[$fixlinenr] =~ s/^\+([ \t]+)/"\+" . tabify($1)/e; } } # check we are in a valid C source file if not then ignore this hunk next if ($realfile !~ /\.(h|c)$/); +# check indentation of any line with a bare else +# if the previous line is a break or return and is indented 1 tab more... + if ($sline =~ /^\+([\t]+)(?:}[ \t]*)?else(?:[ \t]*{)?\s*$/) { + my $tabs = length($1) + 1; + if ($prevline =~ /^\+\t{$tabs,$tabs}(?:break|return)\b/) { + WARN("UNNECESSARY_ELSE", + "else is not generally useful after a break or return\n" . $hereprev); + } + } + +# check indentation of a line with a break; +# if the previous line is a goto or return and is indented the same # of tabs + if ($sline =~ /^\+([\t]+)break\s*;\s*$/) { + my $tabs = $1; + if ($prevline =~ /^\+$tabs(?:goto|return)\b/) { + WARN("UNNECESSARY_BREAK", + "break is not useful after a goto or return\n" . $hereprev); + } + } + # discourage the addition of CONFIG_EXPERIMENTAL in #if(def). if ($line =~ /^\+\s*\#\s*if.*\bCONFIG_EXPERIMENTAL\b/) { WARN("CONFIG_EXPERIMENTAL", @@ -2477,7 +2698,7 @@ sub process { # if/while/etc brace do not go on next line, unless defining a do while loop, # or if that brace on the next line is for something else - if ($line =~ /(.*)\b((?:if|while|for|switch)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { + if ($line =~ /(.*)\b((?:if|while|for|switch|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b|else\b)/ && $line !~ /^.\s*\#/) { my $pre_ctx = "$1$2"; my ($level, @ctx) = ctx_statement_level($linenr, $realcnt, 0); @@ -2504,7 +2725,7 @@ sub process { #print "realcnt<$realcnt> ctx_cnt<$ctx_cnt>\n"; #print "pre<$pre_ctx>\nline<$line>\nctx<$ctx>\nnext<$lines[$ctx_ln - 1]>\n"; - if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln -1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { + if ($ctx !~ /{\s*/ && defined($lines[$ctx_ln - 1]) && $lines[$ctx_ln - 1] =~ /^\+\s*{/) { ERROR("OPEN_BRACE", "that open brace { should be on the previous line\n" . "$here\n$ctx\n$rawlines[$ctx_ln - 1]\n"); @@ -2523,7 +2744,7 @@ sub process { } # Check relative indent for conditionals and blocks. - if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { + if ($line =~ /\b(?:(?:if|while|for|(?:[a-z_]+|)for_each[a-z_]+)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0) if (!defined $stat); @@ -2654,8 +2875,18 @@ sub process { # check for initialisation to aggregates open brace on the next line if ($line =~ /^.\s*{/ && $prevline =~ /(?:^|[^=])=\s*$/) { - ERROR("OPEN_BRACE", - "that open brace { should be on the previous line\n" . $hereprev); + if (ERROR("OPEN_BRACE", + "that open brace { should be on the previous line\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + $fixedline =~ s/\s*=\s*$/ = {/; + fix_insert_line($fixlinenr, $fixedline); + $fixedline = $line; + $fixedline =~ s/^(.\s*){\s*/$1/; + fix_insert_line($fixlinenr, $fixedline); + } } # @@ -2680,10 +2911,10 @@ sub process { if (ERROR("C99_COMMENTS", "do not use C99 // comments\n" . $herecurr) && $fix) { - my $line = $fixed[$linenr - 1]; + my $line = $fixed[$fixlinenr]; if ($line =~ /\/\/(.*)$/) { my $comment = trim($1); - $fixed[$linenr - 1] =~ s@\/\/(.*)$@/\* $comment \*/@; + $fixed[$fixlinenr] =~ s@\/\/(.*)$@/\* $comment \*/@; } } } @@ -2742,7 +2973,7 @@ sub process { "do not initialise globals to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/($Type\s*$Ident\s*(?:\s+$Modifier))*\s*=\s*(0|NULL|false)\s*;/$1;/; } } # check for static initialisers. @@ -2751,7 +2982,7 @@ sub process { "do not initialise statics to 0 or NULL\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; + $fixed[$fixlinenr] =~ s/(\bstatic\s.*?)\s*=\s*(0|NULL|false)\s*;/$1;/; } } @@ -2781,7 +3012,7 @@ sub process { if (ERROR("FUNCTION_WITHOUT_ARGS", "Bad function definition - $1() should probably be $1(void)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; + $fixed[$fixlinenr] =~ s/(\b($Type)\s+($Ident))\s*\(\s*\)/$2 $3(void)/; } } @@ -2790,7 +3021,7 @@ sub process { if (WARN("DEFINE_PCI_DEVICE_TABLE", "Prefer struct pci_device_id over deprecated DEFINE_PCI_DEVICE_TABLE\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; + $fixed[$fixlinenr] =~ s/\b(?:static\s+|)DEFINE_PCI_DEVICE_TABLE\s*\(\s*(\w+)\s*\)\s*=\s*/static const struct pci_device_id $1\[\] = /; } } @@ -2827,7 +3058,7 @@ sub process { my $sub_from = $ident; my $sub_to = $ident; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -2855,7 +3086,7 @@ sub process { my $sub_from = $match; my $sub_to = $match; $sub_to =~ s/\Q$from\E/$to/; - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s@\Q$sub_from\E@$sub_to@; } } @@ -2917,7 +3148,7 @@ sub process { if (WARN("PREFER_PR_LEVEL", "Prefer pr_warn(... to pr_warning(...\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\bpr_warning\b/pr_warn/; } } @@ -2933,17 +3164,40 @@ sub process { # function brace can't be on same line, except for #defines of do while, # or if closed on same line - if (($line=~/$Type\s*$Ident\(.*\).*\s{/) and + if (($line=~/$Type\s*$Ident\(.*\).*\s*{/) and !($line=~/\#\s*define.*do\s{/) and !($line=~/}/)) { - ERROR("OPEN_BRACE", - "open brace '{' following function declarations go on the next line\n" . $herecurr); + if (ERROR("OPEN_BRACE", + "open brace '{' following function declarations go on the next line\n" . $herecurr) && + $fix) { + fix_delete_line($fixlinenr, $rawline); + my $fixed_line = $rawline; + $fixed_line =~ /(^..*$Type\s*$Ident\(.*\)\s*){(.*)$/; + my $line1 = $1; + my $line2 = $2; + fix_insert_line($fixlinenr, ltrim($line1)); + fix_insert_line($fixlinenr, "\+{"); + if ($line2 !~ /^\s*$/) { + fix_insert_line($fixlinenr, "\+\t" . trim($line2)); + } + } } # open braces for enum, union and struct go on the same line. if ($line =~ /^.\s*{/ && $prevline =~ /^.\s*(?:typedef\s+)?(enum|union|struct)(?:\s+$Ident)?\s*$/) { - ERROR("OPEN_BRACE", - "open brace '{' following $1 go on the same line\n" . $hereprev); + if (ERROR("OPEN_BRACE", + "open brace '{' following $1 go on the same line\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = rtrim($prevrawline) . " {"; + fix_insert_line($fixlinenr, $fixedline); + $fixedline = $rawline; + $fixedline =~ s/^(.\s*){\s*/$1\t/; + if ($fixedline !~ /^\+\s*$/) { + fix_insert_line($fixlinenr, $fixedline); + } + } } # missing space after union, struct or enum definition @@ -2951,7 +3205,7 @@ sub process { if (WARN("SPACING", "missing space after $1 definition\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*(?:typedef\s+)?(?:enum|union|struct)(?:\s+$Ident){1,2})([=\{])/$1 $2/; } } @@ -3021,7 +3275,7 @@ sub process { } if (show_type("SPACING") && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*)$Declare\s*\(\s*\*\s*$Ident\s*\)\s*\(/$1 . $declare . $post_declare_space . '(*' . $funcname . ')('/ex; } } @@ -3038,7 +3292,7 @@ sub process { if (ERROR("BRACKET_SPACE", "space prohibited before open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(\+.*?)\s+\[/$1\[/; } } @@ -3073,7 +3327,7 @@ sub process { if (WARN("SPACING", "space prohibited between function name and open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$name\s+\(/$name\(/; } } @@ -3341,8 +3595,8 @@ sub process { $fixed_line = $fixed_line . $fix_elements[$#elements]; } - if ($fix && $line_fixed && $fixed_line ne $fixed[$linenr - 1]) { - $fixed[$linenr - 1] = $fixed_line; + if ($fix && $line_fixed && $fixed_line ne $fixed[$fixlinenr]) { + $fixed[$fixlinenr] = $fixed_line; } @@ -3353,7 +3607,7 @@ sub process { if (WARN("SPACING", "space prohibited before semicolon\n" . $herecurr) && $fix) { - 1 while $fixed[$linenr - 1] =~ + 1 while $fixed[$fixlinenr] =~ s/^(\+.*\S)\s+;/$1;/; } } @@ -3386,7 +3640,7 @@ sub process { if (ERROR("SPACING", "space required before the open brace '{'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/^(\+.*(?:do|\))){/$1 {/; + $fixed[$fixlinenr] =~ s/^(\+.*(?:do|\))){/$1 {/; } } @@ -3404,7 +3658,7 @@ sub process { if (ERROR("SPACING", "space required after that close brace '}'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/}((?!(?:,|;|\)))\S)/} $1/; } } @@ -3414,7 +3668,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open square bracket '['\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\[\s+/\[/; } } @@ -3422,7 +3676,7 @@ sub process { if (ERROR("SPACING", "space prohibited before that close square bracket ']'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\s+\]/\]/; } } @@ -3433,7 +3687,7 @@ sub process { if (ERROR("SPACING", "space prohibited after that open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\(\s+/\(/; } } @@ -3443,18 +3697,27 @@ sub process { if (ERROR("SPACING", "space prohibited before that close parenthesis ')'\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + print("fixlinenr: <$fixlinenr> fixed[fixlinenr]: <$fixed[$fixlinenr]>\n"); + $fixed[$fixlinenr] =~ s/\s+\)/\)/; } } +# check unnecessary parentheses around addressof/dereference single $Lvals +# ie: &(foo->bar) should be &foo->bar and *(foo->bar) should be *foo->bar + + while ($line =~ /(?:[^&]&\s*|\*)\(\s*($Ident\s*(?:$Member\s*)+)\s*\)/g) { + CHK("UNNECESSARY_PARENTHESES", + "Unnecessary parentheses around $1\n" . $herecurr); + } + #goto labels aren't indented, allow a single space however if ($line=~/^.\s+[A-Za-z\d_]+:(?![0-9]+)/ and !($line=~/^. [A-Za-z\d_]+:/) and !($line=~/^.\s+default:/)) { if (WARN("INDENTED_LABEL", "labels should not be indented\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.)\s+/$1/; } } @@ -3516,7 +3779,7 @@ sub process { if (ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b(if|while|for|switch)\(/$1 \(/; } } @@ -3606,7 +3869,7 @@ sub process { # if should not continue a brace if ($line =~ /}\s*if\b/) { ERROR("TRAILING_STATEMENTS", - "trailing statements should be on next line\n" . + "trailing statements should be on next line (or did you mean 'else if'?)\n" . $herecurr); } # case and default should not have general statements after them @@ -3622,14 +3885,26 @@ sub process { # Check for }<nl>else {, these must be at the same # indent level to be relevant to each other. - if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ and - $previndent == $indent) { - ERROR("ELSE_AFTER_BRACE", - "else should follow close brace '}'\n" . $hereprev); + if ($prevline=~/}\s*$/ and $line=~/^.\s*else\s*/ && + $previndent == $indent) { + if (ERROR("ELSE_AFTER_BRACE", + "else should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + $fixedline =~ s/}\s*$//; + if ($fixedline !~ /^\+\s*$/) { + fix_insert_line($fixlinenr, $fixedline); + } + $fixedline = $rawline; + $fixedline =~ s/^(.\s*)else/$1} else/; + fix_insert_line($fixlinenr, $fixedline); + } } - if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ and - $previndent == $indent) { + if ($prevline=~/}\s*$/ and $line=~/^.\s*while\s*/ && + $previndent == $indent) { my ($s, $c) = ctx_statement_block($linenr, $realcnt, 0); # Find out what is on the end of the line after the @@ -3638,8 +3913,18 @@ sub process { $s =~ s/\n.*//g; if ($s =~ /^\s*;/) { - ERROR("WHILE_AFTER_BRACE", - "while should follow close brace '}'\n" . $hereprev); + if (ERROR("WHILE_AFTER_BRACE", + "while should follow close brace '}'\n" . $hereprev) && + $fix && $prevline =~ /^\+/ && $line =~ /^\+/) { + fix_delete_line($fixlinenr - 1, $prevrawline); + fix_delete_line($fixlinenr, $rawline); + my $fixedline = $prevrawline; + my $trailing = $rawline; + $trailing =~ s/^\+//; + $trailing = trim($trailing); + $fixedline =~ s/}\s*$/} $trailing/; + fix_insert_line($fixlinenr, $fixedline); + } } } @@ -3653,7 +3938,7 @@ sub process { "Avoid gcc v4.3+ binary constant extension: <$var>\n" . $herecurr) && $fix) { my $hexval = sprintf("0x%x", oct($var)); - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/\b$var\b/$hexval/; } } @@ -3689,7 +3974,7 @@ sub process { if (WARN("WHITESPACE_AFTER_LINE_CONTINUATION", "Whitespace after \\ makes next lines useless\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\s+$//; + $fixed[$fixlinenr] =~ s/\s+$//; } } @@ -3762,7 +4047,7 @@ sub process { $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); $dstat !~ /^[!~-]?(?:$Lval|$Constant)$/ && # 10 // foo() // !foo // ~foo // -foo // foo->bar // foo.bar->baz - $dstat !~ /^'X'$/ && # character constants + $dstat !~ /^'X'$/ && $dstat !~ /^'XX'$/ && # character constants $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = $dstat !~ /^(?:\#\s*$Ident|\#\s*$Constant)\s*$/ && # stringification #foo @@ -4014,6 +4299,23 @@ sub process { } } +# check for unnecessary "Out of Memory" messages + if ($line =~ /^\+.*\b$logFunctions\s*\(/ && + $prevline =~ /^[ \+]\s*if\s*\(\s*(\!\s*|NULL\s*==\s*)?($Lval)(\s*==\s*NULL\s*)?\s*\)/ && + (defined $1 || defined $3) && + $linenr > 3) { + my $testval = $2; + my $testline = $lines[$linenr - 3]; + + my ($s, $c) = ctx_statement_block($linenr - 3, $realcnt, 0); +# print("line: <$line>\nprevline: <$prevline>\ns: <$s>\nc: <$c>\n\n\n"); + + if ($c =~ /(?:^|\n)[ \+]\s*(?:$Type\s*)?\Q$testval\E\s*=\s*(?:\([^\)]*\)\s*)?\s*(?:devm_)?(?:[kv][czm]alloc(?:_node|_array)?\b|kstrdup|(?:dev_)?alloc_skb)/) { + WARN("OOM_MESSAGE", + "Possible unnecessary 'out of memory' message\n" . $hereprev); + } + } + # check for bad placement of section $InitAttribute (e.g.: __initdata) if ($line =~ /(\b$InitAttribute\b)/) { my $attr = $1; @@ -4027,7 +4329,7 @@ sub process { WARN("MISPLACED_INIT", "$attr should be placed after $var\n" . $herecurr))) && $fix) { - $fixed[$linenr - 1] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; + $fixed[$fixlinenr] =~ s/(\bstatic\s+(?:const\s+)?)(?:$attr\s+)?($NonptrTypeWithAttr)\s+(?:$attr\s+)?($Ident(?:\[[^]]*\])?)\s*([=;])\s*/"$1" . trim(string_find_replace($2, "\\s*$attr\\s*", " ")) . " " . trim(string_find_replace($3, "\\s*$attr\\s*", "")) . " $attr" . ("$4" eq ";" ? ";" : " = ")/e; } } } @@ -4041,7 +4343,7 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of const init definition must use ${attr_prefix}initconst\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/$InitAttributeData/${attr_prefix}initconst/; } } @@ -4052,12 +4354,12 @@ sub process { if (ERROR("INIT_ATTRIBUTE", "Use of $attr requires a separate use of const\n" . $herecurr) && $fix) { - my $lead = $fixed[$linenr - 1] =~ + my $lead = $fixed[$fixlinenr] =~ /(^\+\s*(?:static\s+))/; $lead = rtrim($1); $lead = "$lead " if ($lead !~ /^\+$/); $lead = "${lead}const "; - $fixed[$linenr - 1] =~ s/(^\+\s*(?:static\s+))/$lead/; + $fixed[$fixlinenr] =~ s/(^\+\s*(?:static\s+))/$lead/; } } @@ -4070,7 +4372,7 @@ sub process { if (WARN("CONSTANT_CONVERSION", "$constant_func should be $func\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b$constant_func\b/$func/g; + $fixed[$fixlinenr] =~ s/\b$constant_func\b/$func/g; } } @@ -4120,7 +4422,7 @@ sub process { if (ERROR("SPACING", "exactly one space required after that #$1\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ + $fixed[$fixlinenr] =~ s/^(.\s*\#\s*(ifdef|ifndef|elif))\s{2,}/$1 /; } @@ -4168,7 +4470,7 @@ sub process { if (WARN("INLINE", "plain inline is preferred over $1\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b(__inline__|__inline)\b/inline/; + $fixed[$fixlinenr] =~ s/\b(__inline__|__inline)\b/inline/; } } @@ -4193,7 +4495,7 @@ sub process { if (WARN("PREFER_PRINTF", "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf\s*,\s*(.*)\)\s*\)\s*\)/"__printf(" . trim($1) . ")"/ex; } } @@ -4204,7 +4506,7 @@ sub process { if (WARN("PREFER_SCANF", "__scanf(string-index, first-to-check) is preferred over __attribute__((format(scanf, string-index, first-to-check)))\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\b__attribute__\s*\(\s*\(\s*format\s*\(\s*scanf\s*,\s*(.*)\)\s*\)\s*\)/"__scanf(" . trim($1) . ")"/ex; } } @@ -4219,7 +4521,7 @@ sub process { if (WARN("SIZEOF_PARENTHESIS", "sizeof $1 should be sizeof($1)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; + $fixed[$fixlinenr] =~ s/\bsizeof\s+((?:\*\s*|)$Lval|$Type(?:\s+$Lval|))/"sizeof(" . trim($1) . ")"/ex; } } @@ -4242,7 +4544,7 @@ sub process { if (WARN("PREFER_SEQ_PUTS", "Prefer seq_puts to seq_printf\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bseq_printf\b/seq_puts/; + $fixed[$fixlinenr] =~ s/\bseq_printf\b/seq_puts/; } } } @@ -4271,7 +4573,7 @@ sub process { if (WARN("PREFER_ETHER_ADDR_COPY", "Prefer ether_addr_copy() over memcpy() if the Ethernet addresses are __aligned(2)\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; + $fixed[$fixlinenr] =~ s/\bmemcpy\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*ETH_ALEN\s*\)/ether_addr_copy($2, $7)/; } } @@ -4359,7 +4661,7 @@ sub process { if (CHK("AVOID_EXTERNS", "extern prototypes should be avoided in .h files\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; + $fixed[$fixlinenr] =~ s/(.*)\bextern\b\s*(.*)/$1$2/; } } @@ -4419,23 +4721,24 @@ sub process { # check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc if ($^V && $^V ge 5.10.0 && - $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) { + $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)\s*,/) { my $oldfunc = $3; my $a1 = $4; my $a2 = $10; my $newfunc = "kmalloc_array"; $newfunc = "kcalloc" if ($oldfunc eq "kzalloc"); - if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) { + my $r1 = $a1; + my $r2 = $a2; + if ($a1 =~ /^sizeof\s*\S/) { + $r1 = $a2; + $r2 = $a1; + } + if ($r1 !~ /^sizeof\b/ && $r2 =~ /^sizeof\s*\S/ && + !($r1 =~ /^$Constant$/ || $r1 =~ /^[A-Z_][A-Z0-9_]*$/)) { if (WARN("ALLOC_WITH_MULTIPLY", "Prefer $newfunc over $oldfunc with multiply\n" . $herecurr) && $fix) { - my $r1 = $a1; - my $r2 = $a2; - if ($a1 =~ /^sizeof\s*\S/) { - $r1 = $a2; - $r2 = $a1; - } - $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; + $fixed[$fixlinenr] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; } } @@ -4459,7 +4762,7 @@ sub process { if (WARN("ONE_SEMICOLON", "Statements terminations use 1 semicolon\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/(\s*;\s*){2,}$/;/g; + $fixed[$fixlinenr] =~ s/(\s*;\s*){2,}$/;/g; } } @@ -4507,7 +4810,7 @@ sub process { if (WARN("USE_FUNC", "__func__ should be used instead of gcc specific __FUNCTION__\n" . $herecurr) && $fix) { - $fixed[$linenr - 1] =~ s/\b__FUNCTION__\b/__func__/g; + $fixed[$fixlinenr] =~ s/\b__FUNCTION__\b/__func__/g; } } @@ -4750,12 +5053,16 @@ sub process { hash_show_words(\%use_type, "Used"); hash_show_words(\%ignore_type, "Ignored"); - if ($clean == 0 && $fix && "@rawlines" ne "@fixed") { + if ($clean == 0 && $fix && + ("@rawlines" ne "@fixed" || + $#fixed_inserted >= 0 || $#fixed_deleted >= 0)) { my $newfile = $filename; $newfile .= ".EXPERIMENTAL-checkpatch-fixes" if (!$fix_inplace); my $linecount = 0; my $f; + @fixed = fix_inserted_deleted_lines(\@fixed, \@fixed_inserted, \@fixed_deleted); + open($f, '>', $newfile) or die "$P: Can't open $newfile for write\n"; foreach my $fixed_line (@fixed) { @@ -4763,7 +5070,7 @@ sub process { if ($file) { if ($linecount > 3) { $fixed_line =~ s/^\+//; - print $f $fixed_line. "\n"; + print $f $fixed_line . "\n"; } } else { print $f $fixed_line . "\n"; diff --git a/scripts/checkstack.pl b/scripts/checkstack.pl index c05d586b1fee..899b4230320e 100755 --- a/scripts/checkstack.pl +++ b/scripts/checkstack.pl @@ -52,14 +52,12 @@ my (@stack, $re, $dre, $x, $xs, $funcre); #8000008a: 20 1d sub sp,4 #80000ca8: fa cd 05 b0 sub sp,sp,1456 $re = qr/^.*sub.*sp.*,([0-9]{1,8})/o; - } elsif ($arch =~ /^i[3456]86$/) { + } elsif ($arch =~ /^x86(_64)?$/ || $arch =~ /^i[3456]86$/) { #c0105234: 81 ec ac 05 00 00 sub $0x5ac,%esp - $re = qr/^.*[as][du][db] \$(0x$x{1,8}),\%esp$/o; - $dre = qr/^.*[as][du][db] (%.*),\%esp$/o; - } elsif ($arch eq 'x86_64') { - # 2f60: 48 81 ec e8 05 00 00 sub $0x5e8,%rsp - $re = qr/^.*[as][du][db] \$(0x$x{1,8}),\%rsp$/o; - $dre = qr/^.*[as][du][db] (\%.*),\%rsp$/o; + # or + # 2f60: 48 81 ec e8 05 00 00 sub $0x5e8,%rsp + $re = qr/^.*[as][du][db] \$(0x$x{1,8}),\%(e|r)sp$/o; + $dre = qr/^.*[as][du][db] (%.*),\%(e|r)sp$/o; } elsif ($arch eq 'ia64') { #e0000000044011fc: 01 0f fc 8c adds r12=-384,r12 $re = qr/.*adds.*r12=-(([0-9]{2}|[3-9])[0-9]{2}),r12/o; diff --git a/scripts/coccinelle/free/ifnullfree.cocci b/scripts/coccinelle/free/ifnullfree.cocci new file mode 100644 index 000000000000..9321fe328a97 --- /dev/null +++ b/scripts/coccinelle/free/ifnullfree.cocci @@ -0,0 +1,54 @@ +/// NULL check before some freeing functions is not needed. +/// +/// Based on checkpatch warning +/// "kfree(NULL) is safe this check is probably not required" +/// and kfreeaddr.cocci by Julia Lawall. +/// +// Copyright: (C) 2014 Fabian Frederick. GPLv2. +// Comments: - +// Options: --no-includes --include-headers + +virtual patch +virtual org +virtual report +virtual context + +@r2 depends on patch@ +expression E; +@@ +- if (E) +( +- kfree(E); ++ kfree(E); +| +- debugfs_remove(E); ++ debugfs_remove(E); +| +- debugfs_remove_recursive(E); ++ debugfs_remove_recursive(E); +| +- usb_free_urb(E); ++ usb_free_urb(E); +) + +@r depends on context || report || org @ +expression E; +position p; +@@ + +* if (E) +* \(kfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|usb_free_urb\)(E); + +@script:python depends on org@ +p << r.p; +@@ + +cocci.print_main("NULL check before that freeing function is not needed", p) + +@script:python depends on report@ +p << r.p; +@@ + +msg = "WARNING: NULL check before freeing functions like kfree, debugfs_remove, debugfs_remove_recursive or usb_free_urb is not needed. Maybe consider reorganizing relevant code to avoid passing NULL values." +coccilib.report.print_report(p[0], msg) + diff --git a/scripts/tags.sh b/scripts/tags.sh index e6b011fe1d0d..cbfd269a6011 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -168,6 +168,7 @@ exuberant() --extra=+f --c-kinds=+px \ --regex-asm='/^(ENTRY|_GLOBAL)\(([^)]*)\).*/\2/' \ --regex-c='/^SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/sys_\1/' \ + --regex-c='/^COMPAT_SYSCALL_DEFINE[[:digit:]]?\(([^,)]*).*/compat_sys_\1/' \ --regex-c++='/^TRACE_EVENT\(([^,)]*).*/trace_\1/' \ --regex-c++='/^DEFINE_EVENT\([^,)]*, *([^,)]*).*/trace_\1/' \ --regex-c++='/PAGEFLAG\(([^,)]*).*/Page\1/' \ @@ -231,6 +232,7 @@ emacs() all_target_sources | xargs $1 -a \ --regex='/^\(ENTRY\|_GLOBAL\)(\([^)]*\)).*/\2/' \ --regex='/^SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/sys_\1/' \ + --regex='/^COMPAT_SYSCALL_DEFINE[0-9]?(\([^,)]*\).*/compat_sys_\1/' \ --regex='/^TRACE_EVENT(\([^,)]*\).*/trace_\1/' \ --regex='/^DEFINE_EVENT([^,)]*, *\([^,)]*\).*/trace_\1/' \ --regex='/PAGEFLAG(\([^,)]*\).*/Page\1/' \ |