From 1edf223485c42c99655dcd001db1e46ad5e5d2d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:06:57 -0800 Subject: [PATCH 001/124] mm/page-writeback.c: make determine_dirtyable_memory static again The tracing ring-buffer used this function briefly, but not anymore. Make it local to the writeback code again. Also, move the function so that no forward declaration needs to be reintroduced. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 - mm/page-writeback.c | 122 +++++++++++++++++++------------------- 2 files changed, 60 insertions(+), 64 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c295851f8c..34a005515fef1c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -138,8 +138,6 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern unsigned long determine_dirtyable_memory(void); - extern int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8616ef3025a44a..c081bf62202b0a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -129,6 +129,66 @@ unsigned long global_dirty_limit; */ static struct prop_descriptor vm_completions; +/* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + /* * couple the period to the dirty_ratio: * @@ -196,7 +256,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -291,67 +350,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. - */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { From 34dbc67a644f11ab3475d822d72e25409911e760 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:06:59 -0800 Subject: [PATCH 002/124] vmscan: promote shared file mapped pages Commit 645747462435 ("vmscan: detect mapped file pages used only once") greatly decreases lifetime of single-used mapped file pages. Unfortunately it also decreases life time of all shared mapped file pages. Because after commit bf3f3bc5e7347 ("mm: don't mark_page_accessed in fault path") page-fault handler does not mark page active or even referenced. Thus page_check_references() activates file page only if it was used twice while it stays in inactive list, meanwhile it activates anon pages after first access. Inactive list can be small enough, this way reclaimer can accidentally throw away any widely used page if it wasn't used twice in short period. After this patch page_check_references() also activate file mapped page at first inactive list scan if this page is already used multiple times via several ptes. I found this while trying to fix degragation in rhel6 (~2.6.32) from rhel5 (~2.6.18). There a complete mess with >100 web/mail/spam/ftp containers, they share all their files but there a lot of anonymous pages: ~500mb shared file mapped memory and 15-20Gb non-shared anonymous memory. In this situation major-pagefaults are very costly, because all containers share the same page. In my load kernel created a disproportionate pressure on the file memory, compared with the anonymous, they equaled only if I raise swappiness up to 150 =) These patches actually wasn't helped a lot in my problem, but I saw noticable (10-20 times) reduce in count and average time of major-pagefault in file-mapped areas. Actually both patches are fixes for commit v2.6.33-5448-g6457474, because it was aimed at one scenario (singly used pages), but it breaks the logic in other scenarios (shared and/or executable pages) Signed-off-by: Konstantin Khlebnikov Acked-by: Pekka Enberg Acked-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 11adc890ce30bb..753c1e6755f098 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -715,7 +715,7 @@ static enum page_references page_check_references(struct page *page, */ SetPageReferenced(page); - if (referenced_page) + if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; return PAGEREF_KEEP; From c909e99364c8b6ca07864d752950b6b4ecf6bef4 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:03 -0800 Subject: [PATCH 003/124] vmscan: activate executable pages after first usage Logic added in commit 8cab4754d24a0 ("vmscan: make mapped executable pages the first class citizen") was noticeably weakened in commit 645747462435d84 ("vmscan: detect mapped file pages used only once"). Currently these pages can become "first class citizens" only after second usage. After this patch page_check_references() will activate they after first usage, and executable code gets yet better chance to stay in memory. Signed-off-by: Konstantin Khlebnikov Cc: Pekka Enberg Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Wu Fengguang Cc: Johannes Weiner Cc: Nick Piggin Cc: Mel Gorman Cc: Shaohua Li Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 753c1e6755f098..753a2dc300b983 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -718,6 +718,12 @@ static enum page_references page_check_references(struct page *page, if (referenced_page || referenced_ptes > 1) return PAGEREF_ACTIVATE; + /* + * Activate file-backed executable pages after first usage. + */ + if (vm_flags & VM_EXEC) + return PAGEREF_ACTIVATE; + return PAGEREF_KEEP; } From cc59850ef940e4ee6a765d28b439b9bafe07cf63 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:04 -0800 Subject: [PATCH 004/124] mm: add free_hot_cold_page_list() helper This patch adds helper free_hot_cold_page_list() to free list of 0-order pages. It frees pages directly from list without temporary page-vector. It also calls trace_mm_pagevec_free() to simulate pagevec_free() behaviour. bloat-o-meter: add/remove: 1/1 grow/shrink: 1/3 up/down: 267/-295 (-28) function old new delta free_hot_cold_page_list - 264 +264 get_page_from_freelist 2129 2132 +3 __pagevec_free 243 239 -4 split_free_page 380 373 -7 release_pages 606 510 -96 free_page_list 188 - -188 Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 1 + mm/page_alloc.c | 13 +++++++++++++ mm/swap.c | 14 +++----------- mm/vmscan.c | 20 +------------------- 4 files changed, 18 insertions(+), 30 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3a76faf6a3ee82..656295865d5841 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -358,6 +358,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); +extern void free_hot_cold_page_list(struct list_head *list, int cold); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7990ca154d1b60..cd0c95c6cc9e13 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1188,6 +1188,19 @@ void free_hot_cold_page(struct page *page, int cold) local_irq_restore(flags); } +/* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_pagevec_free(page, cold); + free_hot_cold_page(page, cold); + } +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1<lru_lock, flags); - zone = NULL; - } - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - } + list_add(&page->lru, &pages_to_free); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); - pagevec_free(&pages_to_free); + free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 753a2dc300b983..3d571df41c7917 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -734,24 +734,6 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } -static noinline_for_stack void free_page_list(struct list_head *free_pages) -{ - struct pagevec freed_pvec; - struct page *page, *tmp; - - pagevec_init(&freed_pvec, 1); - - list_for_each_entry_safe(page, tmp, free_pages, lru) { - list_del(&page->lru); - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } - } - - pagevec_free(&freed_pvec); -} - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1015,7 +997,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); - free_page_list(&free_pages); + free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); From da066ad3570b88e7dee82e76a06ee9a7adffcf0d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:06 -0800 Subject: [PATCH 005/124] mm: remove unused pagevec_free It not exported and now nobody uses it. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 7 ------- mm/page_alloc.c | 10 ---------- 2 files changed, 17 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index bab82f4c571c61..ed17024d2ebee5 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, @@ -67,12 +66,6 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } -static inline void pagevec_free(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_free(pvec); -} - static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd0c95c6cc9e13..6c77efbca5bc05 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2319,16 +2319,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { From b413d48aa70605701c0b395b2e350ca15f5d643a Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:09 -0800 Subject: [PATCH 006/124] mm-tracepoint: rename page-free events Rename mm_page_free_direct into mm_page_free and mm_pagevec_free into mm_page_free_batched Since v2.6.33-5426-gc475dab the kernel triggers mm_page_free_direct for all freed pages, not only for directly freed. So, let's name it properly. For pages freed via page-list we also trigger mm_page_free_batched event. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/trace/events-kmem.txt | 12 +++++------ .../trace-pagealloc-postprocess.pl | 20 +++++++++---------- include/trace/events/kmem.h | 4 ++-- mm/page_alloc.c | 4 ++-- 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/Documentation/trace/events-kmem.txt b/Documentation/trace/events-kmem.txt index aa82ee4a5a8762..194800410061b3 100644 --- a/Documentation/trace/events-kmem.txt +++ b/Documentation/trace/events-kmem.txt @@ -40,8 +40,8 @@ but the call_site can usually be used to extrapolate that information. ================== mm_page_alloc page=%p pfn=%lu order=%d migratetype=%d gfp_flags=%s mm_page_alloc_zone_locked page=%p pfn=%lu order=%u migratetype=%d cpu=%d percpu_refill=%d -mm_page_free_direct page=%p pfn=%lu order=%d -mm_pagevec_free page=%p pfn=%lu order=%d cold=%d +mm_page_free page=%p pfn=%lu order=%d +mm_page_free_batched page=%p pfn=%lu order=%d cold=%d These four events deal with page allocation and freeing. mm_page_alloc is a simple indicator of page allocator activity. Pages may be allocated from @@ -53,13 +53,13 @@ amounts of activity imply high activity on the zone->lock. Taking this lock impairs performance by disabling interrupts, dirtying cache lines between CPUs and serialising many CPUs. -When a page is freed directly by the caller, the mm_page_free_direct event +When a page is freed directly by the caller, the only mm_page_free event is triggered. Significant amounts of activity here could indicate that the callers should be batching their activities. -When pages are freed using a pagevec, the mm_pagevec_free is -triggered. Broadly speaking, pages are taken off the LRU lock in bulk and -freed in batch with a pagevec. Significant amounts of activity here could +When pages are freed in batch, the also mm_page_free_batched is triggered. +Broadly speaking, pages are taken off the LRU lock in bulk and +freed in batch with a page list. Significant amounts of activity here could indicate that the system is under memory pressure and can also indicate contention on the zone->lru_lock. diff --git a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl index 7df50e8cf4d951..0a120aae33ce5c 100644 --- a/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl +++ b/Documentation/trace/postprocess/trace-pagealloc-postprocess.pl @@ -17,8 +17,8 @@ # Tracepoint events use constant MM_PAGE_ALLOC => 1; -use constant MM_PAGE_FREE_DIRECT => 2; -use constant MM_PAGEVEC_FREE => 3; +use constant MM_PAGE_FREE => 2; +use constant MM_PAGE_FREE_BATCHED => 3; use constant MM_PAGE_PCPU_DRAIN => 4; use constant MM_PAGE_ALLOC_ZONE_LOCKED => 5; use constant MM_PAGE_ALLOC_EXTFRAG => 6; @@ -223,10 +223,10 @@ sub process_events { # Perl Switch() sucks majorly if ($tracepoint eq "mm_page_alloc") { $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}++; - } elsif ($tracepoint eq "mm_page_free_direct") { - $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}++; - } elsif ($tracepoint eq "mm_pagevec_free") { - $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}++; + } elsif ($tracepoint eq "mm_page_free") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE}++ + } elsif ($tracepoint eq "mm_page_free_batched") { + $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}++; } elsif ($tracepoint eq "mm_page_pcpu_drain") { $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}++; $perprocesspid{$process_pid}->{STATE_PCPU_PAGES_DRAINED}++; @@ -336,8 +336,8 @@ sub dump_stats { $process_pid, $stats{$process_pid}->{MM_PAGE_ALLOC}, $stats{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}, - $stats{$process_pid}->{MM_PAGE_FREE_DIRECT}, - $stats{$process_pid}->{MM_PAGEVEC_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE}, + $stats{$process_pid}->{MM_PAGE_FREE_BATCHED}, $stats{$process_pid}->{MM_PAGE_PCPU_DRAIN}, $stats{$process_pid}->{HIGH_PCPU_DRAINS}, $stats{$process_pid}->{HIGH_PCPU_REFILLS}, @@ -364,8 +364,8 @@ () $perprocess{$process}->{MM_PAGE_ALLOC} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC}; $perprocess{$process}->{MM_PAGE_ALLOC_ZONE_LOCKED} += $perprocesspid{$process_pid}->{MM_PAGE_ALLOC_ZONE_LOCKED}; - $perprocess{$process}->{MM_PAGE_FREE_DIRECT} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_DIRECT}; - $perprocess{$process}->{MM_PAGEVEC_FREE} += $perprocesspid{$process_pid}->{MM_PAGEVEC_FREE}; + $perprocess{$process}->{MM_PAGE_FREE} += $perprocesspid{$process_pid}->{MM_PAGE_FREE}; + $perprocess{$process}->{MM_PAGE_FREE_BATCHED} += $perprocesspid{$process_pid}->{MM_PAGE_FREE_BATCHED}; $perprocess{$process}->{MM_PAGE_PCPU_DRAIN} += $perprocesspid{$process_pid}->{MM_PAGE_PCPU_DRAIN}; $perprocess{$process}->{HIGH_PCPU_DRAINS} += $perprocesspid{$process_pid}->{HIGH_PCPU_DRAINS}; $perprocess{$process}->{HIGH_PCPU_REFILLS} += $perprocesspid{$process_pid}->{HIGH_PCPU_REFILLS}; diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index a9c87ad8331c61..5f889f16b0c891 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -147,7 +147,7 @@ DEFINE_EVENT(kmem_free, kmem_cache_free, TP_ARGS(call_site, ptr) ); -TRACE_EVENT(mm_page_free_direct, +TRACE_EVENT(mm_page_free, TP_PROTO(struct page *page, unsigned int order), @@ -169,7 +169,7 @@ TRACE_EVENT(mm_page_free_direct, __entry->order) ); -TRACE_EVENT(mm_pagevec_free, +TRACE_EVENT(mm_page_free_batched, TP_PROTO(struct page *page, int cold), diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c77efbca5bc05..516ab623d773c3 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -632,7 +632,7 @@ static bool free_pages_prepare(struct page *page, unsigned int order) int i; int bad = 0; - trace_mm_page_free_direct(page, order); + trace_mm_page_free(page, order); kmemcheck_free_shadow(page, order); if (PageAnon(page)) @@ -1196,7 +1196,7 @@ void free_hot_cold_page_list(struct list_head *list, int cold) struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) { - trace_mm_pagevec_free(page, cold); + trace_mm_page_free_batched(page, cold); free_hot_cold_page(page, cold); } } From 90a5d5af74f6570af063fb6bff33c6b2f8361bbc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:10 -0800 Subject: [PATCH 007/124] mm-tracepoint: fix documentation and examples We renamed the page-free mm tracepoints. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/trace/tracepoint-analysis.txt | 40 ++++++++++----------- tools/perf/Documentation/examples.txt | 34 +++++++++--------- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/Documentation/trace/tracepoint-analysis.txt b/Documentation/trace/tracepoint-analysis.txt index 87bee3c129ba71..058cc6c9dc56d4 100644 --- a/Documentation/trace/tracepoint-analysis.txt +++ b/Documentation/trace/tracepoint-analysis.txt @@ -93,14 +93,14 @@ By specifying the -a switch and analysing sleep, the system-wide events for a duration of time can be examined. $ perf stat -a \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ sleep 10 Performance counter stats for 'sleep 10': 9630 kmem:mm_page_alloc - 2143 kmem:mm_page_free_direct - 7424 kmem:mm_pagevec_free + 2143 kmem:mm_page_free + 7424 kmem:mm_page_free_batched 10.002577764 seconds time elapsed @@ -119,15 +119,15 @@ basis using set_ftrace_pid. Events can be activated and tracked for the duration of a process on a local basis using PCL such as follows. - $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free ./hackbench 10 + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched ./hackbench 10 Time: 0.909 Performance counter stats for './hackbench 10': 17803 kmem:mm_page_alloc - 12398 kmem:mm_page_free_direct - 4827 kmem:mm_pagevec_free + 12398 kmem:mm_page_free + 4827 kmem:mm_page_free_batched 0.973913387 seconds time elapsed @@ -146,8 +146,8 @@ to know what the standard deviation is. By and large, this is left to the performance analyst to do it by hand. In the event that the discrete event occurrences are useful to the performance analyst, then perf can be used. - $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free_direct - -e kmem:mm_pagevec_free ./hackbench 10 + $ perf stat --repeat 5 -e kmem:mm_page_alloc -e kmem:mm_page_free + -e kmem:mm_page_free_batched ./hackbench 10 Time: 0.890 Time: 0.895 Time: 0.915 @@ -157,8 +157,8 @@ occurrences are useful to the performance analyst, then perf can be used. Performance counter stats for './hackbench 10' (5 runs): 16630 kmem:mm_page_alloc ( +- 3.542% ) - 11486 kmem:mm_page_free_direct ( +- 4.771% ) - 4730 kmem:mm_pagevec_free ( +- 2.325% ) + 11486 kmem:mm_page_free ( +- 4.771% ) + 4730 kmem:mm_page_free_batched ( +- 2.325% ) 0.982653002 seconds time elapsed ( +- 1.448% ) @@ -168,15 +168,15 @@ aggregation of discrete events, then a script would need to be developed. Using --repeat, it is also possible to view how events are fluctuating over time on a system-wide basis using -a and sleep. - $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + $ perf stat -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ -a --repeat 10 \ sleep 1 Performance counter stats for 'sleep 1' (10 runs): 1066 kmem:mm_page_alloc ( +- 26.148% ) - 182 kmem:mm_page_free_direct ( +- 5.464% ) - 890 kmem:mm_pagevec_free ( +- 30.079% ) + 182 kmem:mm_page_free ( +- 5.464% ) + 890 kmem:mm_page_free_batched ( +- 30.079% ) 1.002251757 seconds time elapsed ( +- 0.005% ) @@ -220,8 +220,8 @@ were generating events within the kernel. To begin this sort of analysis, the data must be recorded. At the time of writing, this required root: $ perf record -c 1 \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ ./hackbench 10 Time: 0.894 [ perf record: Captured and wrote 0.733 MB perf.data (~32010 samples) ] @@ -260,8 +260,8 @@ noticed that X was generating an insane amount of page allocations so let's look at it: $ perf record -c 1 -f \ - -e kmem:mm_page_alloc -e kmem:mm_page_free_direct \ - -e kmem:mm_pagevec_free \ + -e kmem:mm_page_alloc -e kmem:mm_page_free \ + -e kmem:mm_page_free_batched \ -p `pidof X` This was interrupted after a few seconds and diff --git a/tools/perf/Documentation/examples.txt b/tools/perf/Documentation/examples.txt index 8eb6c489fb152c..77f95276242666 100644 --- a/tools/perf/Documentation/examples.txt +++ b/tools/perf/Documentation/examples.txt @@ -17,8 +17,8 @@ titan:~> perf list kmem:kmem_cache_alloc_node [Tracepoint event] kmem:kfree [Tracepoint event] kmem:kmem_cache_free [Tracepoint event] - kmem:mm_page_free_direct [Tracepoint event] - kmem:mm_pagevec_free [Tracepoint event] + kmem:mm_page_free [Tracepoint event] + kmem:mm_page_free_batched [Tracepoint event] kmem:mm_page_alloc [Tracepoint event] kmem:mm_page_alloc_zone_locked [Tracepoint event] kmem:mm_page_pcpu_drain [Tracepoint event] @@ -29,15 +29,15 @@ measured. For example the page alloc/free properties of a 'hackbench run' are: titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc - -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10 + -e kmem:mm_page_free_batched -e kmem:mm_page_free ./hackbench 10 Time: 0.575 Performance counter stats for './hackbench 10': 13857 kmem:mm_page_pcpu_drain 27576 kmem:mm_page_alloc - 6025 kmem:mm_pagevec_free - 20934 kmem:mm_page_free_direct + 6025 kmem:mm_page_free_batched + 20934 kmem:mm_page_free 0.613972165 seconds time elapsed @@ -45,8 +45,8 @@ You can observe the statistical properties as well, by using the 'repeat the workload N times' feature of perf stat: titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e - kmem:mm_page_alloc -e kmem:mm_pagevec_free -e - kmem:mm_page_free_direct ./hackbench 10 + kmem:mm_page_alloc -e kmem:mm_page_free_batched -e + kmem:mm_page_free ./hackbench 10 Time: 0.627 Time: 0.644 Time: 0.564 @@ -57,8 +57,8 @@ You can observe the statistical properties as well, by using the 12920 kmem:mm_page_pcpu_drain ( +- 3.359% ) 25035 kmem:mm_page_alloc ( +- 3.783% ) - 6104 kmem:mm_pagevec_free ( +- 0.934% ) - 18376 kmem:mm_page_free_direct ( +- 4.941% ) + 6104 kmem:mm_page_free_batched ( +- 0.934% ) + 18376 kmem:mm_page_free ( +- 4.941% ) 0.643954516 seconds time elapsed ( +- 2.363% ) @@ -158,15 +158,15 @@ Or you can observe the whole system's page allocations for 10 seconds: titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e -kmem:mm_page_alloc -e kmem:mm_pagevec_free -e -kmem:mm_page_free_direct sleep 10 +kmem:mm_page_alloc -e kmem:mm_page_free_batched -e +kmem:mm_page_free sleep 10 Performance counter stats for 'sleep 10': 171585 kmem:mm_page_pcpu_drain 322114 kmem:mm_page_alloc - 73623 kmem:mm_pagevec_free - 254115 kmem:mm_page_free_direct + 73623 kmem:mm_page_free_batched + 254115 kmem:mm_page_free 10.000591410 seconds time elapsed @@ -174,15 +174,15 @@ Or observe how fluctuating the page allocations are, via statistical analysis done over ten 1-second intervals: titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e - kmem:mm_page_alloc -e kmem:mm_pagevec_free -e - kmem:mm_page_free_direct sleep 1 + kmem:mm_page_alloc -e kmem:mm_page_free_batched -e + kmem:mm_page_free sleep 1 Performance counter stats for 'sleep 1' (10 runs): 17254 kmem:mm_page_pcpu_drain ( +- 3.709% ) 34394 kmem:mm_page_alloc ( +- 4.617% ) - 7509 kmem:mm_pagevec_free ( +- 4.820% ) - 25653 kmem:mm_page_free_direct ( +- 3.672% ) + 7509 kmem:mm_page_free_batched ( +- 4.820% ) + 25653 kmem:mm_page_free ( +- 3.672% ) 1.058135029 seconds time elapsed ( +- 3.089% ) From 937a94c9db30a818baa5e2c09dbf4589251355c3 Mon Sep 17 00:00:00 2001 From: Jacobo Giralt Date: Tue, 10 Jan 2012 15:07:11 -0800 Subject: [PATCH 008/124] mm: migrate: one less atomic operation migrate_page_move_mapping() drops a reference from the old page after unfreezing its counter. Both operations can be merged into a single atomic operation by directly unfreezing to one less reference. The same applies to migrate_huge_page_move_mapping(). Signed-off-by: Jacobo Giralt Cc: Mel Gorman Cc: Minchan Kim Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 177aca424a069a..594dc375d0f969 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -269,12 +269,12 @@ static int migrate_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); /* - * Drop cache reference from old page. + * Drop cache reference from old page by unfreezing + * to one less reference. * We know this isn't the last reference. */ - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); /* * If moved to a different zone then also account @@ -334,9 +334,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, radix_tree_replace_slot(pslot, newpage); - page_unfreeze_refs(page, expected_count); - - __put_page(page); + page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); return 0; From 938929f14cb595f43cd1a4e63e22d36cab1e4a1f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:14 -0800 Subject: [PATCH 009/124] mm: reduce the amount of work done when updating min_free_kbytes When min_free_kbytes is updated, some pageblocks are marked MIGRATE_RESERVE. Ordinarily, this work is unnoticable as it happens early in boot but on large machines with 1TB of memory, this has been reported to delay boot times, probably due to the NUMA distances involved. The bulk of the work is due to calling calling pageblock_is_reserved() an unnecessary amount of times and accessing far more struct page metadata than is necessary. This patch significantly reduces the amount of work done by setup_zone_migrate_reserve() improving boot times on 1TB machines. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 516ab623d773c3..671e6c94fed77f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3388,25 +3388,33 @@ static void setup_zone_migrate_reserve(struct zone *zone) if (page_to_nid(page) != zone_to_nid(zone)) continue; - /* Blocks with reserved pages will never free, skip them. */ - block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); - if (pageblock_is_reserved(pfn, block_end_pfn)) - continue; - block_migratetype = get_pageblock_migratetype(page); - /* If this block is reserved, account for it */ - if (reserve > 0 && block_migratetype == MIGRATE_RESERVE) { - reserve--; - continue; - } + /* Only test what is necessary when the reserves are not met */ + if (reserve > 0) { + /* + * Blocks with reserved pages will never free, skip + * them. + */ + block_end_pfn = min(pfn + pageblock_nr_pages, end_pfn); + if (pageblock_is_reserved(pfn, block_end_pfn)) + continue; - /* Suitable for reserving if this block is movable */ - if (reserve > 0 && block_migratetype == MIGRATE_MOVABLE) { - set_pageblock_migratetype(page, MIGRATE_RESERVE); - move_freepages_block(zone, page, MIGRATE_RESERVE); - reserve--; - continue; + /* If this block is reserved, account for it */ + if (block_migratetype == MIGRATE_RESERVE) { + reserve--; + continue; + } + + /* Suitable for reserving if this block is movable */ + if (block_migratetype == MIGRATE_MOVABLE) { + set_pageblock_migratetype(page, + MIGRATE_RESERVE); + move_freepages_block(zone, page, + MIGRATE_RESERVE); + reserve--; + continue; + } } /* From f90ac3982a78d36f894824636beeef13361d7c59 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:15 -0800 Subject: [PATCH 010/124] mm: avoid livelock on !__GFP_FS allocations Colin Cross reported; Under the following conditions, __alloc_pages_slowpath can loop forever: gfp_mask & __GFP_WAIT is true gfp_mask & __GFP_FS is false reclaim and compaction make no progress order <= PAGE_ALLOC_COSTLY_ORDER These conditions happen very often during suspend and resume, when pm_restrict_gfp_mask() effectively converts all GFP_KERNEL allocations into __GFP_WAIT. The oom killer is not run because gfp_mask & __GFP_FS is false, but should_alloc_retry will always return true when order is less than PAGE_ALLOC_COSTLY_ORDER. In his fix, he avoided retrying the allocation if reclaim made no progress and __GFP_FS was not set. The problem is that this would result in GFP_NOIO allocations failing that previously succeeded which would be very unfortunate. The big difference between GFP_NOIO and suspend converting GFP_KERNEL to behave like GFP_NOIO is that normally flushers will be cleaning pages and kswapd reclaims pages allowing GFP_NOIO to succeed after a short delay. The same does not necessarily apply during suspend as the storage device may be suspended. This patch special cases the suspend case to fail the page allocation if reclaim cannot make progress and adds some documentation on how gfp_allowed_mask is currently used. Failing allocations like this may cause suspend to abort but that is better than a livelock. [mgorman@suse.de: Rework fix to be suspend specific] [rientjes@google.com: Move suspended device check to should_alloc_retry] Reported-by: Colin Cross Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Minchan Kim Cc: Pekka Enberg Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 16 ++++++++++++++++ mm/page_alloc.c | 30 ++++++++++++++++++++++-------- mm/swapfile.c | 6 +++--- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 656295865d5841..91812df1351af6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -368,9 +368,25 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +/* + * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what + * GFP flags are used before interrupts are enabled. Once interrupts are + * enabled, it is set to __GFP_BITS_MASK while the system is running. During + * hibernation, it is used by PM to avoid I/O during memory allocation while + * devices are suspended. + */ extern gfp_t gfp_allowed_mask; extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +#ifdef CONFIG_PM_SLEEP +extern bool pm_suspended_storage(void); +#else +static inline bool pm_suspended_storage(void) +{ + return false; +} +#endif /* CONFIG_PM_SLEEP */ + #endif /* __LINUX_GFP_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 671e6c94fed77f..3cba4b67203f03 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -127,6 +127,13 @@ void pm_restrict_gfp_mask(void) saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -1786,12 +1793,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. + */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@ -1810,13 +1830,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. - */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@ -2209,7 +2222,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; diff --git a/mm/swapfile.c b/mm/swapfile.c index b1cd120607230b..9520592d4231e4 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibernation clears bits from gfp_allowed_mask to prevent - * memory reclaim from writing to disk, so check that here. + * Hibration suspends storage while it is writing the image + * to disk so check that here. */ - if (!(gfp_allowed_mask & __GFP_IO)) + if (pm_suspended_storage()) return 0; delete_from_swap_cache(page); From 5f8aefd44e64ed2f6950a1dcc77309b7dd9979f4 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:18 -0800 Subject: [PATCH 011/124] mm: account reaped page cache on inode cache pruning Inode cache pruning indirectly reclaims page-cache by invalidating mapping pages. Let's account them into reclaim-state to notice this progress in memory reclaimer. Signed-off-by: Konstantin Khlebnikov Cc: Dave Chinner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index 87535753ab04a7..4fa4f0916af904 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -776,6 +776,8 @@ void prune_icache_sb(struct super_block *sb, int nr_to_scan) else __count_vm_events(PGINODESTEAL, reap); spin_unlock(&sb->s_inode_lru_lock); + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += reap; dispose_list(&freeable); } From a734bcc812146cfba530e1adaf609fce1357982e Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:07:20 -0800 Subject: [PATCH 012/124] hugetlb: detect race upon page allocation failure during COW Currently we are not rechecking pte_same in hugetlb_cow after we take ptl lock again in the page allocation failure code path and simply retry again. This is not an issue at the moment because hugetlb fault path is protected by hugetlb_instantiation_mutex so we cannot race. The original page is locked and so we cannot race even with the page migration. Let's add the pte_same check anyway as we want to be consistent with the other check later in this function and be safe if we ever remove the mutex. [mhocko@suse.cz: reworded the changelog] Signed-off-by: Hillf Danton Signed-off-by: Michal Hocko Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7acd12503f734b..2c551b28ba69c6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2408,7 +2408,14 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); spin_lock(&mm->page_table_lock); - goto retry_avoidcopy; + ptep = huge_pte_offset(mm, address & huge_page_mask(h)); + if (likely(pte_same(huge_ptep_get(ptep), pte))) + goto retry_avoidcopy; + /* + * race occurs while re-acquiring page_table_lock, and + * our job is done. + */ + return 0; } WARN_ON_ONCE(1); } From ef009b25f4f8a77d2b32067d424d5ac757dcdc5b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2012 15:07:21 -0800 Subject: [PATCH 013/124] hugetlb: clarify hugetlb_instantiation_mutex usage Let's make it clear that we cannot race with other fault handlers due to hugetlb (global) mutex. Also make it clear that we want to keep pte_same checks anayway to have a transition from the global mutex easier. Signed-off-by: Michal Hocko Cc: Hillf Danton Cc: Andrea Arcangeli Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c551b28ba69c6..49e693b7fd0c69 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2349,6 +2349,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, /* * Hugetlb_cow() should be called with page lock of the original hugepage held. + * Called with hugetlb_instantiation_mutex held and pte_page locked so we + * cannot race with other handlers or page migration. + * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, From 1e16a539ac16e7b3a8c2cee188897d4bdb88e6e8 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:07:22 -0800 Subject: [PATCH 014/124] mm/hugetlb.c: fix virtual address handling in hugetlb fault handle_mm_fault() passes 'faulted' address to hugetlb_fault(). This address is not aligned to a hugepage boundary. Most of the functions for hugetlb pages are aware of that and calculate an alignment themselves. However some functions such as copy_user_huge_page() and clear_huge_page() don't handle alignment by themselves. This patch make hugeltb_fault() fix the alignment and pass an aligned addresss (to address of a faulted hugepage) to functions. [akpm@linux-foundation.org: use &=] Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 49e693b7fd0c69..ab89d6f382d127 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2640,6 +2640,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + address &= huge_page_mask(h); + ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); From 1399ff86f2a2bbacbbe68fa00c5f8c752b344723 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:07:25 -0800 Subject: [PATCH 015/124] kernel.h: add BUILD_BUG() macro We can place this in definitions that we expect the compiler to remove by dead code elimination. If this assertion fails, we get a nice error message at build time. The GCC function attribute error("message") was added in version 4.3, so we define a new macro __linktime_error(message) to expand to this for GCC-4.3 and later. This will give us an error diagnostic from the compiler on the line that fails. For other compilers __linktime_error(message) expands to nothing, and we have to be content with a link time error, but at least we will still get a build error. BUILD_BUG() expands to the undefined function __build_bug_failed() and will fail at link time if the compiler ever emits code for it. On GCC-4.3 and later, attribute((error())) is used so that the failure will be noted at compile time instead. Signed-off-by: David Daney Acked-by: David Rientjes Cc: DM Cc: Ralf Baechle Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc4.h | 1 + include/linux/compiler.h | 4 +++- include/linux/kernel.h | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index dfadc96e9d6385..2f4079175afb81 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -29,6 +29,7 @@ the kernel context */ #define __cold __attribute__((__cold__)) +#define __linktime_error(message) __attribute__((__error__(message))) #if __GNUC_MINOR__ >= 5 /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 320d6c94ff848d..4a243546d142b3 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -293,7 +293,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #ifndef __compiletime_error # define __compiletime_error(message) #endif - +#ifndef __linktime_error +# define __linktime_error(message) +#endif /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8b1597b5cf259..f48e8a52854427 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -665,6 +665,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) #define BUILD_BUG_ON(condition) +#define BUILD_BUG() (0) #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ @@ -703,6 +704,21 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. + */ +#define BUILD_BUG() \ + do { \ + extern void __build_bug_failed(void) \ + __linktime_error("BUILD_BUG failed"); \ + __build_bug_failed(); \ + } while (0) + #endif /* __CHECKER__ */ /* Trap pasters of __FUNCTION__ at compile-time */ From c0a32fc5a2e470d0b02597b23ad79a317735253e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:28 -0800 Subject: [PATCH 016/124] mm: more intensive memory corruption debugging With CONFIG_DEBUG_PAGEALLOC configured, the CPU will generate an exception on access (read,write) to an unallocated page, which permits us to catch code which corrupts memory. However the kernel is trying to maximise memory usage, hence there are usually few free pages in the system and buggy code usually corrupts some crucial data. This patch changes the buddy allocator to keep more free/protected pages and to interlace free/protected and allocated pages to increase the probability of catching corruption. When the kernel is compiled with CONFIG_DEBUG_PAGEALLOC, debug_guardpage_minorder defines the minimum order used by the page allocator to grant a request. The requested size will be returned with the remaining pages used as guard pages. The default value of debug_guardpage_minorder is zero: no change from current behaviour. [akpm@linux-foundation.org: tweak documentation, s/flg/flag/] Signed-off-by: Stanislaw Gruszka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: "Rafael J. Wysocki" Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 19 ++++++++ include/linux/mm.h | 17 +++++++ include/linux/page-debug-flags.h | 4 +- mm/Kconfig.debug | 5 ++ mm/page_alloc.c | 75 ++++++++++++++++++++++++++--- 5 files changed, 113 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7b2e5c5eefa602..7ed7030e77226c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -623,6 +623,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted. no_debug_objects [KNL] Disable object debugging + debug_guardpage_minorder= + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + parameter allows control of the order of pages that will + be intentionally kept free (and hence protected) by the + buddy allocator. Bigger value increase the probability + of catching random memory corruption, but reduce the + amount of memory for normal system use. The maximum + possible value is MAX_ORDER/2. Setting this parameter + to 1 or 2 should be enough to identify most random + memory corruption problems caused by bugs in kernel or + driver code when a CPU writes to (or reads from) a + random memory location. Note that there exists a class + of memory corruptions problems caused by buggy H/W or + F/W or by drivers badly programing DMA (basically when + memory is written at bus level and the CPU MMU is + bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not help + tracking down these problems. + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d9b4c9813bdb7..5568553a41fd60 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1618,5 +1618,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_DEBUG_PAGEALLOC +extern unsigned int _debug_guardpage_minorder; + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool page_is_guard(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool page_is_guard(struct page *page) { return false; } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h index b0638fd91e92fd..22691f614043df 100644 --- a/include/linux/page-debug-flags.h +++ b/include/linux/page-debug-flags.h @@ -13,6 +13,7 @@ enum page_debug_flags { PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ + PAGE_DEBUG_FLAG_GUARD, }; /* @@ -21,7 +22,8 @@ enum page_debug_flags { */ #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS -#if !defined(CONFIG_PAGE_POISONING) \ +#if !defined(CONFIG_PAGE_POISONING) && \ + !defined(CONFIG_PAGE_GUARD) \ /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! #endif diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 8b1a477162dc07..4b2443254de260 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types @@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS config PAGE_POISONING bool select WANT_PAGE_DEBUG_FLAGS + +config PAGE_GUARD + bool + select WANT_PAGE_DEBUG_FLAGS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3cba4b67203f03..93baebcc06f381 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -388,6 +389,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -445,6 +477,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@ -501,11 +538,19 @@ static inline void __free_one_page(struct page *page, buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@ -731,6 +776,23 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. + * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -1754,7 +1816,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; /* From c6968e73b90c2a2fb9a32d4bad249f8f70f70125 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:31 -0800 Subject: [PATCH 017/124] PM/Hibernate: do not count debug pages as savable When debugging with CONFIG_DEBUG_PAGEALLOC and debug_guardpage_minorder > 0, we have lot of free pages that are not marked so. Snapshot code account them as savable, what cause hibernate memory preallocation failure. It is pretty hard to make hibernate allocation succeed with debug_guardpage_minorder=1. This change at least make it possible when system has relatively big amount of RAM. Signed-off-by: Stanislaw Gruszka Acked-by: Rafael J. Wysocki Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index cbe2c14413927c..1cf88900ec4fdc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -858,6 +858,9 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) PageReserved(page)) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } @@ -920,6 +923,9 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; + if (page_is_guard(page)) + return NULL; + return page; } From fc8d8620d39dbbaf412b1b9247d77d196d92adb9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:32 -0800 Subject: [PATCH 018/124] slub: min order when debug_guardpage_minorder > 0 Disable slub debug facilities and allocate slabs at minimal order when debug_guardpage_minorder > 0 to increase probability to catch random memory corruption by cpu exception. Signed-off-by: Stanislaw Gruszka Cc: "Rafael J. Wysocki" Cc: Andrea Arcangeli Acked-by: Christoph Lameter Cc: Mel Gorman Cc: Stanislaw Gruszka Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 025f6ac5156973..d99acbf14e0179 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3654,6 +3654,9 @@ void __init kmem_cache_init(void) struct kmem_cache *temp_kmem_cache_node; unsigned long kmalloc_size; + if (debug_guardpage_minorder()) + slub_max_order = 0; + kmem_size = offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *); From ad8a1b558e6c76fb53901956d3c8f29b82a4ccfa Mon Sep 17 00:00:00 2001 From: Shawn Bohrer Date: Tue, 10 Jan 2012 15:07:35 -0800 Subject: [PATCH 019/124] fadvise: only initiate writeback for specified range with FADV_DONTNEED Previously POSIX_FADV_DONTNEED would start writeback for the entire file when the bdi was not write congested. This negatively impacts performance if the file contains dirty pages outside of the requested range. This change uses __filemap_fdatawrite_range() to only initiate writeback for the requested range. Signed-off-by: Shawn Bohrer Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index 8d723c9e8b75b3..469491e0af79fe 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -117,7 +117,8 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) break; case POSIX_FADV_DONTNEED: if (!bdi_write_congested(mapping->backing_dev_info)) - filemap_flush(mapping); + __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); /* First and last FULL page! */ start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT; From f6d7e0cb3ecc248e98fa11d83253f6174bd7e085 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 10 Jan 2012 15:07:38 -0800 Subject: [PATCH 020/124] mm, debug: test for online nid when allocating on single node Calling alloc_pages_exact_node() means the allocation only passes the zonelist of a single node into the page allocator. If that node isn't online, it's zonelist may never have been initialized causing a strange oops that may not immediately be clear. I recently debugged an issue where node 0 wasn't online and an allocator was passing 0 to alloc_pages_exact_node() and it resulted in a NULL pointer on zonelist->_zoneref. If CONFIG_DEBUG_VM is enabled, though, it would be nice to catch this a bit earlier. Signed-off-by: David Rientjes Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 91812df1351af6..66f172fdf5fe91 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,7 +313,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, unsigned int order) { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } From 25bd91bd27820d5971258cecd1c0e64b0e485144 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:07:40 -0800 Subject: [PATCH 021/124] vmscan: add task name to warn_scan_unevictable() messages If we need to know a usecase, caller program name is critical important. Show it. Signed-off-by: KOSAKI Motohiro Acked-by: Johannes Weiner David Rientjes Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3d571df41c7917..974162c80edb3d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3436,9 +3436,10 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) static void warn_scan_unevictable_pages(void) { printk_once(KERN_WARNING - "The scan_unevictable_pages sysctl/node-interface has been " + "%s: The scan_unevictable_pages sysctl/node-interface has been " "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n"); + "one, please send an email to linux-mm@kvack.org.\n", + current->comm); } /* From ab8fabd46f811d5153d8a0cd2fac9a0d41fb593d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:42 -0800 Subject: [PATCH 022/124] mm: exclude reserved pages from dirtyable memory Per-zone dirty limits try to distribute page cache pages allocated for writing across zones in proportion to the individual zone sizes, to reduce the likelihood of reclaim having to write back individual pages from the LRU lists in order to make progress. This patch: The amount of dirtyable pages should not include the full number of free pages: there is a number of reserved pages that the page allocator and kswapd always try to keep free. The closer (reclaimable pages - dirty pages) is to the number of reserved pages, the more likely it becomes for reclaim to run into dirty pages: +----------+ --- | anon | | +----------+ | | | | | | -- dirty limit new -- flusher new | file | | | | | | | | | -- dirty limit old -- flusher old | | | +----------+ --- reclaim | reserved | +----------+ | kernel | +----------+ This patch introduces a per-zone dirty reserve that takes both the lowmem reserve as well as the high watermark of the zone into account, and a global sum of those per-zone values that is subtracted from the global amount of dirtyable pages. The lowmem reserve is unavailable to page cache allocations and kswapd tries to keep the high watermark free. We don't want to end up in a situation where reclaim has to clean pages in order to balance zones. Not treating reserved pages as dirtyable on a global level is only a conceptual fix. In reality, dirty pages are not distributed equally across zones and reclaim runs into dirty pages on a regular basis. But it is important to get this right before tackling the problem on a per-zone level, where the distance between reclaim and the dirty pages is mostly much smaller in absolute numbers. [akpm@linux-foundation.org: fix highmem build] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Michal Hocko Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ include/linux/swap.h | 1 + mm/page-writeback.c | 5 +++-- mm/page_alloc.c | 19 +++++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ac040f1936963..ca6ca92418a6ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -317,6 +317,12 @@ struct zone { */ unsigned long lowmem_reserve[MAX_NR_ZONES]; + /* + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. + */ + unsigned long dirty_balance_reserve; + #ifdef CONFIG_NUMA int node; /* diff --git a/include/linux/swap.h b/include/linux/swap.h index 1e22e126d2acc7..06061a7f8e6913 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -207,6 +207,7 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; +extern unsigned long dirty_balance_reserve; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c081bf62202b0a..9ab6de82d8e692 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -157,7 +157,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); + zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* * Make sure that the number of highmem pages is never larger @@ -181,7 +181,8 @@ static unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93baebcc06f381..2cb9eb71e282f2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -97,6 +97,14 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. + */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -4822,8 +4830,19 @@ static void calculate_totalreserve_pages(void) if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } From ccafa2879fb8d13b8031337a8743eac4189e5d6e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:44 -0800 Subject: [PATCH 023/124] mm: writeback: cleanups in preparation for per-zone dirty limits The next patch will introduce per-zone dirty limiting functions in addition to the traditional global dirty limiting. Rename determine_dirtyable_memory() to global_dirtyable_memory() before adding the zone-specific version, and fix up its documentation. Also, move the functions to determine the dirtyable memory and the function to calculate the dirty limit based on that together so that their relationship is more apparent and that they can be commented on as a group. Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 93 +++++++++++++++++++++++---------------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9ab6de82d8e692..433fa990fe8b15 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -146,6 +146,7 @@ static struct prop_descriptor vm_completions; * We make sure that the background writeout level is below the adjusted * clamping level. */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -172,12 +173,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) } /** - * determine_dirtyable_memory - amount of memory that may be used + * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. */ -static unsigned long determine_dirtyable_memory(void) +unsigned long global_dirtyable_memory(void) { unsigned long x; @@ -190,6 +191,47 @@ static unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } +/* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + unsigned long background; + unsigned long dirty; + unsigned long uninitialized_var(available_memory); + struct task_struct *tsk; + + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = global_dirtyable_memory(); + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else + dirty = (vm_dirty_ratio * available_memory) / 100; + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + trace_global_dirty_state(background, dirty); +} + /* * couple the period to the dirty_ratio: * @@ -202,7 +244,7 @@ static int calc_period_shift(void) if (vm_dirty_bytes) dirty_total = vm_dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -362,47 +404,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) return max(thresh, global_dirty_limit); } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds - * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. - */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) -{ - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); - struct task_struct *tsk; - - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = determine_dirtyable_memory(); - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else - dirty = (vm_dirty_ratio * available_memory) / 100; - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); -} - /** * bdi_dirty_limit - @bdi's share of dirty throttling threshold * @bdi: the backing_dev_info to query From a756cf5908530e8b40bdf569eb48b40139e8d7fd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:49 -0800 Subject: [PATCH 024/124] mm: try to distribute dirty pages fairly across zones The maximum number of dirty pages that exist in the system at any time is determined by a number of pages considered dirtyable and a user-configured percentage of those, or an absolute number in bytes. This number of dirtyable pages is the sum of memory provided by all the zones in the system minus their lowmem reserves and high watermarks, so that the system can retain a healthy number of free pages without having to reclaim dirty pages. But there is a flaw in that we have a zoned page allocator which does not care about the global state but rather the state of individual memory zones. And right now there is nothing that prevents one zone from filling up with dirty pages while other zones are spared, which frequently leads to situations where kswapd, in order to restore the watermark of free pages, does indeed have to write pages from that zone's LRU list. This can interfere so badly with IO from the flusher threads that major filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim already, taking away the VM's only possibility to keep such a zone balanced, aside from hoping the flushers will soon clean pages from that zone. Enter per-zone dirty limits. They are to a zone's dirtyable memory what the global limit is to the global amount of dirtyable memory, and try to make sure that no single zone receives more than its fair share of the globally allowed dirty pages in the first place. As the number of pages considered dirtyable excludes the zones' lowmem reserves and high watermarks, the maximum number of dirty pages in a zone is such that the zone can always be balanced without requiring page cleaning. As this is a placement decision in the page allocator and pages are dirtied only after the allocation, this patch allows allocators to pass __GFP_WRITE when they know in advance that the page will be written to and become dirty soon. The page allocator will then attempt to allocate from the first zone of the zonelist - which on NUMA is determined by the task's NUMA memory policy - that has not exceeded its dirty limit. At first glance, it would appear that the diversion to lower zones can increase pressure on them, but this is not the case. With a full high zone, allocations will be diverted to lower zones eventually, so it is more of a shift in timing of the lower zone allocations. Workloads that previously could fit their dirty pages completely in the higher zone may be forced to allocate from lower zones, but the amount of pages that "spill over" are limited themselves by the lower zones' dirty constraints, and thus unlikely to become a problem. For now, the problem of unfair dirty page distribution remains for NUMA configurations where the zones allowed for allocation are in sum not big enough to trigger the global dirty limits, wake up the flusher threads and remedy the situation. Because of this, an allocation that could not succeed on any of the considered zones is allowed to ignore the dirty limits before going into direct reclaim or even failing the allocation, until a future patch changes the global dirty throttling and flusher thread activation so that they take individual zone states into account. Test results 15M DMA + 3246M DMA32 + 504 Normal = 3765M memory 40% dirty ratio 16G USB thumb drive 10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15)) seconds nr_vmscan_write (stddev) min| median| max xfs vanilla: 549.747( 3.492) 0.000| 0.000| 0.000 patched: 550.996( 3.802) 0.000| 0.000| 0.000 fuse-ntfs vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000 patched: 558.049(17.914) 0.000| 0.000| 43.000 btrfs vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000 patched: 563.365(11.368) 0.000| 0.000| 1362.000 ext4 vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000 patched: 568.806(17.496) 0.000| 0.000| 0.000 Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Tested-by: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 +- include/linux/writeback.h | 1 + mm/page-writeback.c | 82 +++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 29 ++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 66f172fdf5fe91..581e74b7df95e3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,6 +36,7 @@ struct vm_area_struct; #endif #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u +#define ___GFP_WRITE 0x1000000u /* * GFP bitmasks.. @@ -85,6 +86,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* * This may seem redundant, but it's a way of annotating false positives vs. @@ -92,7 +94,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 34a005515fef1c..6dff47304971a5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -124,6 +124,7 @@ void laptop_mode_timer_fn(unsigned long data); static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +bool zone_dirty_ok(struct zone *zone); extern unsigned long global_dirty_limit; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 433fa990fe8b15..5cdd4f2b0c9d8b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -147,6 +147,24 @@ static struct prop_descriptor vm_completions; * clamping level. */ +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effictive number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -232,6 +250,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) trace_global_dirty_state(background, dirty); } +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + /* + * The effective global number of dirtyable pages may exclude + * highmem as a big-picture measure to keep the ratio between + * dirty memory and lowmem reasonable. + * + * But this function is purely about the individual zone and a + * highmem zone can hold its share of dirty pages, so we don't + * care about vm_highmem_is_dirtyable here. + */ + return zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone) - + zone->dirty_balance_reserve; +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. + */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + /* * couple the period to the dirty_ratio: * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2cb9eb71e282f2..4f95bcf0f2b17a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1735,6 +1735,35 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + /* + * When allocating a page cache page for writing, we + * want to get it from a zone that is within its dirty + * limit, such that no single zone holds more than its + * proportional share of globally allowed dirty pages. + * The dirty limits take into account the zone's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * This may look like it could increase pressure on + * lower zones by failing allocations in higher zones + * before they are full. But the pages that do spill + * over are limited as the lower zones are protected + * by this very same mechanism. It should not become + * a practical burden to them. + * + * XXX: For now, allow allocations to potentially + * exceed the per-zone dirty limit in the slowpath + * (ALLOC_WMARK_LOW unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * zones are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ + if ((alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + goto this_zone_full; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { From 0faa70cb0180d45a06208e54b552a538aabb8a30 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:53 -0800 Subject: [PATCH 025/124] mm: filemap: pass __GFP_WRITE from grab_cache_page_write_begin() Tell the page allocator that pages allocated through grab_cache_page_write_begin() are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index a0701e6eec107a..c4ee2e918bea1c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2351,8 +2351,11 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { int status; + gfp_t gfp_mask; struct page *page; gfp_t gfp_notmask = 0; + + gfp_mask = mapping_gfp_mask(mapping) | __GFP_WRITE; if (flags & AOP_FLAG_NOFS) gfp_notmask = __GFP_FS; repeat: @@ -2360,7 +2363,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, if (page) goto found; - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); + page = __page_cache_alloc(gfp_mask & ~gfp_notmask); if (!page) return NULL; status = add_to_page_cache_lru(page, mapping, index, From e3a41a5ba9c2ab988b9f1442925109dca2382fd9 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:55 -0800 Subject: [PATCH 026/124] btrfs: pass __GFP_WRITE for buffered write page allocations Tell the page allocator that pages allocated for a buffered write are expected to become dirty soon. Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Acked-by: Mel Gorman Cc: Minchan Kim Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 97fbe939c050dc..20375e6691c390 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1081,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file, again: for (i = 0; i < num_pages; i++) { pages[i] = find_or_create_page(inode->i_mapping, index + i, - mask); + mask | __GFP_WRITE); if (!pages[i]) { faili = i - 1; err = -ENOMEM; From 31b8384a555d94c78e2ea2284a323cb985441f60 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:07:59 -0800 Subject: [PATCH 027/124] mm: compaction: push isolate search base of compact control one pfn ahead After isolated the current pfn will no longer be scanned and isolated if the next round is necessary, so push the isolate_migratepages search base of the given compact_control one step ahead. Signed-off-by: Hillf Danton Reviewed-by: Andrea Arcangeli Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 1253d7ac332b50..e6670c34eb4950 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -365,8 +365,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, nr_isolated++; /* Avoid isolating too much */ - if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { + ++low_pfn; break; + } } acct_isolated(zone, cc); From 9571a982903bf9dcbca2479fd3e7dafd2211ecf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 10 Jan 2012 15:08:00 -0800 Subject: [PATCH 028/124] bootmem: micro optimize freeing pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The first entry of bdata->node_bootmem_map holds the data for bdata->node_min_pfn up to bdata->node_min_pfn + BITS_PER_LONG - 1. So the test for freeing all pages of a single map entry can be slightly relaxed. Moreover use DIV_ROUND_UP in another place instead of open coding it. Signed-off-by: Uwe Kleine-König Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 1a77012ecdb3c5..3e6f152f117e87 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -56,7 +56,7 @@ early_param("bootmem_debug", bootmem_debug_setup); static unsigned long __init bootmap_bytes(unsigned long pages) { - unsigned long bytes = (pages + 7) / 8; + unsigned long bytes = DIV_ROUND_UP(pages, 8); return ALIGN(bytes, sizeof(long)); } @@ -197,7 +197,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - if (aligned && vec == ~0UL && start + BITS_PER_LONG < end) { + if (aligned && vec == ~0UL && start + BITS_PER_LONG <= end) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); From df0a6daa01fa3856c08f4274d4f21a8092caa480 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2012 15:08:02 -0800 Subject: [PATCH 029/124] mm: fix off-by-two in __zone_watermark_ok() Commit 88f5acf88ae6 ("mm: page allocator: adjust the per-cpu counter threshold when memory is low") changed the form how free_pages is calculated but it forgot that we used to do free_pages - ((1 << order) - 1) so we ended up with off-by-two when calculating free_pages. Reported-by: Wang Sheng-Hui Signed-off-by: Michal Hocko Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4f95bcf0f2b17a..59153da58c69bc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1525,7 +1525,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, long min = mark; int o; - free_pages -= (1 << order) + 1; + free_pages -= (1 << order) - 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) From 948f017b093a9baac23855fcd920d3a970b71bb6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 10 Jan 2012 15:08:05 -0800 Subject: [PATCH 030/124] mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma() migrate was doing an rmap_walk with speculative lock-less access on pagetables. That could lead it to not serializing properly against mremap PT locks. But a second problem remains in the order of vmas in the same_anon_vma list used by the rmap_walk. If vma_merge succeeds in copy_vma, the src vma could be placed after the dst vma in the same_anon_vma list. That could still lead to migrate missing some pte. This patch adds an anon_vma_moveto_tail() function to force the dst vma at the end of the list before mremap starts to solve the problem. If the mremap is very large and there are a lots of parents or childs sharing the anon_vma root lock, this should still scale better than taking the anon_vma root lock around every pte copy practically for the whole duration of mremap. Update: Hugh noticed special care is needed in the error path where move_page_tables goes in the reverse direction, a second anon_vma_moveto_tail() call is needed in the error path. This program exercises the anon_vma_moveto_tail: === int main() { static struct timeval oldstamp, newstamp; long diffsec; char *p, *p2, *p3, *p4; if (posix_memalign((void **)&p, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p2, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p3, 2*1024*1024, SIZE)) perror("memalign"), exit(1); memset(p, 0xff, SIZE); printf("%p\n", p); memset(p2, 0xff, SIZE); memset(p3, 0x77, 4096); if (memcmp(p, p2, SIZE)) printf("error\n"); p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3); if (p4 != p3) perror("mremap"), exit(1); p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2); if (p4 != p+SIZE/2) perror("mremap"), exit(1); if (memcmp(p, p2, SIZE)) printf("error\n"); printf("ok\n"); return 0; } === $ perf probe -a anon_vma_moveto_tail Add new event: probe:anon_vma_moveto_tail (on anon_vma_moveto_tail) You can now use it on all perf tools, such as: perf record -e probe:anon_vma_moveto_tail -aR sleep 1 $ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail 0x7f2ca2800000 ok [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ] $ perf report --stdio 100.00% anon_vma_moveto [kernel.kallsyms] [k] anon_vma_moveto_tail Signed-off-by: Andrea Arcangeli Reported-by: Nai Xia Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Pawel Sikora Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + mm/mmap.c | 24 ++++++++++++++++++++--- mm/mremap.c | 9 +++++++++ mm/rmap.c | 45 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2148b122779b5a..1afb9954bbf125 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -120,6 +120,7 @@ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); +void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); diff --git a/mm/mmap.c b/mm/mmap.c index eae90af60ea62e..adea3b8880e3ff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; struct mempolicy *pol; + bool faulted_in_anon_vma = true; /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. */ - if (!vma->vm_file && !vma->anon_vma) + if (unlikely(!vma->vm_file && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, /* * Source vma may have been merged into new_vma */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. + */ + VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; + } else + anon_vma_moveto_tail(new_vma); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index d6959cb4df58f1..87bb8393e7d238 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -220,6 +220,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { + /* + * Before moving the page tables from the new vma to + * the old vma, we need to be sure the old vma is + * queued after new vma in the same_anon_vma list to + * prevent SMP races with rmap_walk (that could lead + * rmap_walk to miss some page table). + */ + anon_vma_moveto_tail(vma); + /* * On error, move entries back from new area to old, * which will succeed since page tables still there, diff --git a/mm/rmap.c b/mm/rmap.c index a4fd3680038be4..a2e5ce1fa08142 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -271,6 +271,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return -ENOMEM; } +/* + * Some rmap walk that needs to find all ptes/hugepmds without false + * negatives (like migrate and split_huge_page) running concurrent + * with operations that copy or move pagetables (like mremap() and + * fork()) to be safe. They depend on the anon_vma "same_anon_vma" + * list to be in a certain order: the dst_vma must be placed after the + * src_vma in the list. This is always guaranteed by fork() but + * mremap() needs to call this function to enforce it in case the + * dst_vma isn't newly allocated and chained with the anon_vma_clone() + * function but just an extension of a pre-existing vma through + * vma_merge. + * + * NOTE: the same_anon_vma list can still be changed by other + * processes while mremap runs because mremap doesn't hold the + * anon_vma mutex to prevent modifications to the list while it + * runs. All we need to enforce is that the relative order of this + * process vmas isn't changing (we don't care about other vmas + * order). Each vma corresponds to an anon_vma_chain structure so + * there's no risk that other processes calling anon_vma_moveto_tail() + * and changing the same_anon_vma list under mremap() will screw with + * the relative order of this process vmas in the list, because we + * they can't alter the order of any vma that belongs to this + * process. And there can't be another anon_vma_moveto_tail() running + * concurrently with mremap() coming from this process because we hold + * the mmap_sem for the whole mremap(). fork() ordering dependency + * also shouldn't be affected because fork() only cares that the + * parent vmas are placed in the list before the child vmas and + * anon_vma_moveto_tail() won't reorder vmas from either the fork() + * parent or child. + */ +void anon_vma_moveto_tail(struct vm_area_struct *dst) +{ + struct anon_vma_chain *pavc; + struct anon_vma *root = NULL; + + list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = pavc->anon_vma; + VM_BUG_ON(pavc->vma != dst); + root = lock_anon_vma_root(root, anon_vma); + list_del(&pavc->same_anon_vma); + list_add_tail(&pavc->same_anon_vma, &anon_vma->head); + } + unlock_anon_vma_root(root); +} + /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. From 6bd4837de96e7d9f9bf33e59117c24fc230862ac Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:08:07 -0800 Subject: [PATCH 031/124] mm: simplify find_vma_prev() commit 297c5eee37 ("mm: make the vma list be doubly linked") added the vm_prev member to vm_area_struct. We can simplify find_vma_prev() by using it. Also, this change helps to improve page fault performance because it has stronger locality of reference. Signed-off-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Peter Zijlstra Cc: Shaohua Li Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 36 ++++++++---------------------------- 1 file changed, 8 insertions(+), 28 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index adea3b8880e3ff..3f758c7f4c815c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1603,39 +1603,19 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -/* Same as find_vma, but also return a pointer to the previous VMA in *pprev. */ +/* + * Same as find_vma, but also return a pointer to the previous VMA in *pprev. + * Note: pprev is set to NULL when return value is NULL. + */ struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr, struct vm_area_struct **pprev) { - struct vm_area_struct *vma = NULL, *prev = NULL; - struct rb_node *rb_node; - if (!mm) - goto out; - - /* Guard against addr being lower than the first VMA */ - vma = mm->mmap; - - /* Go through the RB tree quickly. */ - rb_node = mm->mm_rb.rb_node; - - while (rb_node) { - struct vm_area_struct *vma_tmp; - vma_tmp = rb_entry(rb_node, struct vm_area_struct, vm_rb); - - if (addr < vma_tmp->vm_end) { - rb_node = rb_node->rb_left; - } else { - prev = vma_tmp; - if (!prev->vm_next || (addr < prev->vm_next->vm_end)) - break; - rb_node = rb_node->rb_right; - } - } + struct vm_area_struct *vma; -out: - *pprev = prev; - return prev ? prev->vm_next : vma; + vma = find_vma(mm, addr); + *pprev = vma ? vma->vm_prev : NULL; + return vma; } /* From 43d2b113241d6797b890318767e0af78e313414b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Jan 2012 15:08:09 -0800 Subject: [PATCH 032/124] tracepoint: add tracepoints for debugging oom_score_adj oom_score_adj is used for guarding processes from OOM-Killer. One of problem is that it's inherited at fork(). When a daemon set oom_score_adj and make children, it's hard to know where the value is set. This patch adds some tracepoints useful for debugging. This patch adds 3 trace points. - creating new task - renaming a task (exec) - set oom_score_adj To debug, users need to enable some trace pointer. Maybe filtering is useful as # EVENT=/sys/kernel/debug/tracing/events/task/ # echo "oom_score_adj != 0" > $EVENT/task_newtask/filter # echo "oom_score_adj != 0" > $EVENT/task_rename/filter # echo 1 > $EVENT/enable # EVENT=/sys/kernel/debug/tracing/events/oom/ # echo 1 > $EVENT/enable output will be like this. # grep oom /sys/kernel/debug/tracing/trace bash-7699 [007] d..3 5140.744510: oom_score_adj_update: pid=7699 comm=bash oom_score_adj=-1000 bash-7699 [007] ...1 5151.818022: task_newtask: pid=7729 comm=bash clone_flags=1200011 oom_score_adj=-1000 ls-7729 [003] ...2 5151.818504: task_rename: pid=7729 oldcomm=bash newcomm=ls oom_score_adj=-1000 bash-7699 [002] ...1 5175.701468: task_newtask: pid=7730 comm=bash clone_flags=1200011 oom_score_adj=-1000 grep-7730 [007] ...2 5175.701993: task_rename: pid=7730 oldcomm=bash newcomm=grep oom_score_adj=-1000 Signed-off-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 4 +++ fs/proc/base.c | 3 ++ include/trace/events/oom.h | 33 ++++++++++++++++++++ include/trace/events/task.h | 61 +++++++++++++++++++++++++++++++++++++ kernel/fork.c | 6 ++++ mm/oom_kill.c | 6 ++++ 6 files changed, 113 insertions(+) create mode 100644 include/trace/events/oom.h create mode 100644 include/trace/events/task.h diff --git a/fs/exec.c b/fs/exec.c index 3f64b9f26e7df5..aeb135c7ff5c0c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -59,6 +59,8 @@ #include #include #include + +#include #include "internal.h" int core_uses_pid; @@ -1054,6 +1056,8 @@ void set_task_comm(struct task_struct *tsk, char *buf) { task_lock(tsk); + trace_task_rename(tsk, buf); + /* * Threads may access current->comm without holding * the task lock, so write the string carefully. diff --git a/fs/proc/base.c b/fs/proc/base.c index a1dddda999f208..1aab5fe05a1b12 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -86,6 +86,7 @@ #ifdef CONFIG_HARDWALL #include #endif +#include #include "internal.h" /* NOTE: @@ -1010,6 +1011,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, else task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; + trace_oom_score_adj_update(task); err_sighand: unlock_task_sighand(task, &flags); err_task_lock: @@ -1097,6 +1099,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, task->signal->oom_score_adj = oom_score_adj; if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) task->signal->oom_score_adj_min = oom_score_adj; + trace_oom_score_adj_update(task); /* * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is * always attainable. diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h new file mode 100644 index 00000000000000..dd4ba3b92002aa --- /dev/null +++ b/include/trace/events/oom.h @@ -0,0 +1,33 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM oom + +#if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_OOM_H +#include + +TRACE_EVENT(oom_score_adj_update, + + TP_PROTO(struct task_struct *task), + + TP_ARGS(task), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN ) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s oom_score_adj=%d", + __entry->pid, __entry->comm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/task.h b/include/trace/events/task.h new file mode 100644 index 00000000000000..b53add02e929ef --- /dev/null +++ b/include/trace/events/task.h @@ -0,0 +1,61 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM task + +#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASK_H +#include + +TRACE_EVENT(task_newtask, + + TP_PROTO(struct task_struct *task, unsigned long clone_flags), + + TP_ARGS(task, clone_flags), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, comm, TASK_COMM_LEN) + __field( unsigned long, clone_flags) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(__entry->comm, task->comm, TASK_COMM_LEN); + __entry->clone_flags = clone_flags; + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%d", + __entry->pid, __entry->comm, + __entry->clone_flags, __entry->oom_score_adj) +); + +TRACE_EVENT(task_rename, + + TP_PROTO(struct task_struct *task, char *comm), + + TP_ARGS(task, comm), + + TP_STRUCT__entry( + __field( pid_t, pid) + __array( char, oldcomm, TASK_COMM_LEN) + __array( char, newcomm, TASK_COMM_LEN) + __field( int, oom_score_adj) + ), + + TP_fast_assign( + __entry->pid = task->pid; + memcpy(entry->oldcomm, task->comm, TASK_COMM_LEN); + memcpy(entry->newcomm, comm, TASK_COMM_LEN); + __entry->oom_score_adj = task->signal->oom_score_adj; + ), + + TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%d", + __entry->pid, __entry->oldcomm, + __entry->newcomm, __entry->oom_score_adj) +); + +#endif + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index b00711ce7c1322..5e1391b5ade058 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -76,6 +76,9 @@ #include +#define CREATE_TRACE_POINTS +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -1370,6 +1373,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if (clone_flags & CLONE_THREAD) threadgroup_change_end(current); perf_event_fork(p); + + trace_task_newtask(p, clone_flags); + return p; bad_fork_free_pid: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index eeb27e27dce3d9..7c122faa05c5ba 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -33,6 +33,10 @@ #include #include #include +#include + +#define CREATE_TRACE_POINTS +#include int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; @@ -55,6 +59,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) spin_lock_irq(&sighand->siglock); if (current->signal->oom_score_adj == old_val) current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); } @@ -74,6 +79,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; current->signal->oom_score_adj = new_val; + trace_oom_score_adj_update(current); spin_unlock_irq(&sighand->siglock); return old_val; From c3993076f842de3754360e5b998d6657a9d30303 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:10 -0800 Subject: [PATCH 033/124] mm: page_alloc: generalize order handling in __free_pages_bootmem() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit __free_pages_bootmem() used to special-case higher-order frees to save individual page checking with free_pages_bulk(). Nowadays, both zero order and non-zero order frees use free_pages(), which checks each individual page anyway, and so there is little point in making the distinction anymore. The higher-order loop will work just fine for zero order pages. Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 33 ++++++++++++--------------------- 1 file changed, 12 insertions(+), 21 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 59153da58c69bc..794e6715c22682 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -730,32 +730,23 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * permit the bootmem allocator to evade page validation on high-order frees - */ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) { - if (order == 0) { - __ClearPageReserved(page); - set_page_count(page, 0); - set_page_refcounted(page); - __free_page(page); - } else { - int loop; - - prefetchw(page); - for (loop = 0; loop < (1 << order); loop++) { - struct page *p = &page[loop]; + unsigned int nr_pages = 1 << order; + unsigned int loop; - if (loop + 1 < (1 << order)) - prefetchw(p + 1); - __ClearPageReserved(p); - set_page_count(p, 0); - } + prefetchw(page); + for (loop = 0; loop < nr_pages; loop++) { + struct page *p = &page[loop]; - set_page_refcounted(page); - __free_pages(page, order); + if (loop + 1 < nr_pages) + prefetchw(p + 1); + __ClearPageReserved(p); + set_page_count(p, 0); } + + set_page_refcounted(page); + __free_pages(page, order); } From 560a036b3a3733e33424385c0a0c799dee454d05 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:13 -0800 Subject: [PATCH 034/124] mm: bootmem: drop superfluous range check when freeing pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The area node_bootmem_map represents is aligned to BITS_PER_LONG, and all bits in any aligned word of that map valid. When the represented area extends beyond the end of the node, the non-existant pages will be marked as reserved. As a result, when freeing a page block, doing an explicit range check for whether that block is within the node's range is redundant as the bitmap is consulted anyway to see whether all pages in the block are unreserved. Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 3e6f152f117e87..1aea171539acfa 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -197,7 +197,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - if (aligned && vec == ~0UL && start + BITS_PER_LONG <= end) { + if (aligned && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); From 799f933a82d878d7f15215473c5561ce984ada75 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:08:15 -0800 Subject: [PATCH 035/124] mm: bootmem: try harder to free pages in bulk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The loop that frees pages to the page allocator while bootstrapping tries to free higher-order blocks only when the starting address is aligned to that block size. Otherwise it will free all pages on that node one-by-one. Change it to free individual pages up to the first aligned block and then try higher-order frees from there. Signed-off-by: Johannes Weiner Cc: Uwe Kleine-König Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/mm/bootmem.c b/mm/bootmem.c index 1aea171539acfa..668e94df8cf23a 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -171,7 +171,6 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { - int aligned; struct page *page; unsigned long start, end, pages, count = 0; @@ -181,14 +180,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) start = bdata->node_min_pfn; end = bdata->node_low_pfn; - /* - * If the start is aligned to the machines wordsize, we might - * be able to free pages in bulks of that order. - */ - aligned = !(start & (BITS_PER_LONG - 1)); - - bdebug("nid=%td start=%lx end=%lx aligned=%d\n", - bdata - bootmem_node_data, start, end, aligned); + bdebug("nid=%td start=%lx end=%lx\n", + bdata - bootmem_node_data, start, end); while (start < end) { unsigned long *map, idx, vec; @@ -196,12 +189,17 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; vec = ~map[idx / BITS_PER_LONG]; - - if (aligned && vec == ~0UL) { + /* + * If we have a properly aligned and fully unreserved + * BITS_PER_LONG block of pages in front of us, free + * it in one go. + */ + if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) { int order = ilog2(BITS_PER_LONG); __free_pages_bootmem(pfn_to_page(start), order); count += BITS_PER_LONG; + start += BITS_PER_LONG; } else { unsigned long off = 0; @@ -214,8 +212,8 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) vec >>= 1; off++; } + start = ALIGN(start + 1, BITS_PER_LONG); } - start += BITS_PER_LONG; } page = virt_to_page(bdata->node_bootmem_map); From 86cfd3a45042ab242d47f3935a02811a402beab6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2012 15:08:18 -0800 Subject: [PATCH 036/124] mm/vmscan.c: consider swap space when deciding whether to continue reclaim It's pointless to continue reclaiming when we have no swap space and lots of anon pages in the inactive list. Without this patch, it is possible when swap is disabled to continue trying to reclaim when there are only anonymous pages in the system even though that will not make any progress. Signed-off-by: Minchan Kim Cc: KOSAKI Motohiro Acked-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 974162c80edb3d..b935e6f0d69518 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2000,8 +2000,9 @@ static inline bool should_continue_reclaim(struct zone *zone, * inactive lists are large enough, continue reclaiming */ pages_for_compaction = (2UL << sc->order); - inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (nr_swap_pages > 0) + inactive_lru_pages += zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); if (sc->nr_reclaimed < pages_for_compaction && inactive_lru_pages > pages_for_compaction) return true; From 0c176d52b0b2619f231b2bbf329b90c028134f58 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:19 -0800 Subject: [PATCH 037/124] mm: hugetlb: fix pgoff computation when unmapping page from vma The computation for pgoff is incorrect, at least with (vma->vm_pgoff >> PAGE_SHIFT) involved. It is fixed with the available method if HPAGE_SIZE is concerned in page cache lookup. [akpm@linux-foundation.org: use vma_hugecache_offset() directly, per Michal] Signed-off-by: Hillf Danton Cc: Mel Gorman Cc: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ab89d6f382d127..bb7dc405634ff3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2315,8 +2315,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from page cache lookup which is in HPAGE_SIZE units. */ address = address & huge_page_mask(h); - pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) - + (vma->vm_pgoff >> PAGE_SHIFT); + pgoff = vma_hugecache_offset(h, vma, address); mapping = (struct address_space *)page_private(page); /* From fcfb4dcc9698f932836aa63ba0d82e7dbd300fb3 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:08:21 -0800 Subject: [PATCH 038/124] mm/mempolicy.c: mpol_equal(): use bool mpol_equal() logically returns a boolean. Use a bool type to slightly improve readability. Signed-off-by: KOSAKI Motohiro Cc: Stephen Wilson Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 10 +++++----- mm/mempolicy.c | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7978eec1b7d996..7c727a90d70da6 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -164,11 +164,11 @@ static inline void mpol_get(struct mempolicy *pol) atomic_inc(&pol->refcnt); } -extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b); -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) - return 1; + return true; return __mpol_equal(a, b); } @@ -257,9 +257,9 @@ static inline int vma_migratable(struct vm_area_struct *vma) struct mempolicy {}; -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { - return 1; + return true; } static inline void mpol_put(struct mempolicy *p) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c3fdbcb17658ce..e3d58f088466c3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, } /* Slow path of a mempolicy comparison */ -int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) - return 0; + return false; if (a->mode != b->mode) - return 0; + return false; if (a->flags != b->flags) - return 0; + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) - return 0; + return false; switch (a->mode) { case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - return nodes_equal(a->v.nodes, b->v.nodes); + return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; default: BUG(); - return 0; + return false; } } From 564c81db19f3630f53a14bbceb7b85eb9660ded3 Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 10 Jan 2012 15:08:22 -0800 Subject: [PATCH 039/124] mm/migrate.c: cleanup comment for migration_entry_wait() migration_entry_wait() can also be called from hugetlb_fault() now. Remove the incorrect comment. Signed-off-by: Wang Sheng-Hui Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 594dc375d0f969..670bb8911cd196 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -181,8 +181,6 @@ static void remove_migration_ptes(struct page *old, struct page *new) * Something used the pte of a page under migration. We need to * get to the page and wait until migration is finished. * When we return from this function the fault will be retried. - * - * This function is called from do_swap_page(). */ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) From 5b990546e33477c34ee6fbc20fad6584386b46c3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:23 -0800 Subject: [PATCH 040/124] mempool: fix and document synchronization and memory barrier usage mempool_alloc/free() use undocumented smp_mb()'s. The code is slightly broken and misleading. The lockless part is in mempool_free(). It wants to determine whether the item being freed needs to be returned to the pool or backing allocator without grabbing pool->lock. Two things need to be guaranteed for correct operation. 1. pool->curr_nr + #allocated should never dip below pool->min_nr. 2. Waiters shouldn't be left dangling. For #1, The only necessary condition is that curr_nr visible at free is from after the allocation of the element being freed (details in the comment). For most cases, this is true without any barrier but there can be fringe cases where the allocated pointer is passed to the freeing task without going through memory barriers. To cover this case, wmb is necessary before returning from allocation and rmb is necessary before reading curr_nr. IOW, ALLOCATING TASK FREEING TASK update pool state after alloc; wmb(); pass pointer to freeing task; read pointer; rmb(); read pool state to free; The current code doesn't have wmb after pool update during allocation and may theoretically, on machines where unlock doesn't behave as full wmb, lead to pool depletion and deadlock. smp_wmb() needs to be added after successful allocation from reserved elements and smp_mb() in mempool_free() can be replaced with smp_rmb(). For #2, the waiter needs to add itself to waitqueue and then check the wait condition and the waker needs to update the wait condition and then wake up. Because waitqueue operations always go through full spinlock synchronization, there is no need for extra memory barriers. Furthermore, mempool_alloc() is already holding pool->lock when it decides that it needs to wait. There is no reason to do unlock - add waitqueue - test condition again. It can simply add itself to waitqueue while holding pool->lock and then unlock and sleep. This patch adds smp_wmb() after successful allocation from reserved pool, replaces smp_mb() in mempool_free() with smp_rmb() and extend pool->lock over waitqueue addition. More importantly, it explains what memory barriers do and how the lockless testing is correct. -v2: Oleg pointed out that unlock doesn't imply wmb. Added explicit smp_wmb() after successful allocation from reserved pool and updated comments accordingly. Signed-off-by: Tejun Heo Cc: Oleg Nesterov Cc: "Paul E. McKenney" Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 61 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index e73641b79bb5f0..11f0d0a5e0f853 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -224,28 +224,31 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) if (likely(pool->curr_nr)) { element = remove_element(pool); spin_unlock_irqrestore(&pool->lock, flags); + /* paired with rmb in mempool_free(), read comment there */ + smp_wmb(); return element; } - spin_unlock_irqrestore(&pool->lock, flags); /* We must not sleep in the GFP_ATOMIC case */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_WAIT)) { + spin_unlock_irqrestore(&pool->lock, flags); return NULL; + } - /* Now start performing page reclaim */ + /* Let's wait for someone else to return an element to @pool */ gfp_temp = gfp_mask; init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); - smp_mb(); - if (!pool->curr_nr) { - /* - * FIXME: this should be io_schedule(). The timeout is there - * as a workaround for some DM problems in 2.6.18. - */ - io_schedule_timeout(5*HZ); - } - finish_wait(&pool->wait, &wait); + spin_unlock_irqrestore(&pool->lock, flags); + + /* + * FIXME: this should be io_schedule(). The timeout is there as a + * workaround for some DM problems in 2.6.18. + */ + io_schedule_timeout(5*HZ); + + finish_wait(&pool->wait, &wait); goto repeat_alloc; } EXPORT_SYMBOL(mempool_alloc); @@ -265,7 +268,39 @@ void mempool_free(void *element, mempool_t *pool) if (unlikely(element == NULL)) return; - smp_mb(); + /* + * Paired with the wmb in mempool_alloc(). The preceding read is + * for @element and the following @pool->curr_nr. This ensures + * that the visible value of @pool->curr_nr is from after the + * allocation of @element. This is necessary for fringe cases + * where @element was passed to this task without going through + * barriers. + * + * For example, assume @p is %NULL at the beginning and one task + * performs "p = mempool_alloc(...);" while another task is doing + * "while (!p) cpu_relax(); mempool_free(p, ...);". This function + * may end up using curr_nr value which is from before allocation + * of @p without the following rmb. + */ + smp_rmb(); + + /* + * For correctness, we need a test which is guaranteed to trigger + * if curr_nr + #allocated == min_nr. Testing curr_nr < min_nr + * without locking achieves that and refilling as soon as possible + * is desirable. + * + * Because curr_nr visible here is always a value after the + * allocation of @element, any task which decremented curr_nr below + * min_nr is guaranteed to see curr_nr < min_nr unless curr_nr gets + * incremented to min_nr afterwards. If curr_nr gets incremented + * to min_nr after the allocation of @element, the elements + * allocated after that are subject to the same guarantee. + * + * Waiters happen iff curr_nr is 0 and the above guarantee also + * ensures that there will be frees which return elements to the + * pool waking up the waiters. + */ if (pool->curr_nr < pool->min_nr) { spin_lock_irqsave(&pool->lock, flags); if (pool->curr_nr < pool->min_nr) { From 0565d317768cc66b13e37184f29d9f270c2886dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:26 -0800 Subject: [PATCH 041/124] mempool: drop unnecessary and incorrect BUG_ON() from mempool_destroy() mempool_destroy() is a thin wrapper around free_pool(). The only thing it adds is BUG_ON(pool->curr_nr != pool->min_nr). The intention seems to be to enforce that all allocated elements are freed; however, the BUG_ON() can't achieve that (it doesn't know anything about objects above min_nr) and incorrect as mempool_resize() is allowed to leave the pool extended but not filled. Furthermore, panicking is way worse than any memory leak and there are better debug tools to track memory leaks. Drop the BUG_ON() from mempool_destory() and as that leaves the function identical to free_pool(), replace it. Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index 11f0d0a5e0f853..e3a802a0cea096 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -27,7 +27,15 @@ static void *remove_element(mempool_t *pool) return pool->elements[--pool->curr_nr]; } -static void free_pool(mempool_t *pool) +/** + * mempool_destroy - deallocate a memory pool + * @pool: pointer to the memory pool which was allocated via + * mempool_create(). + * + * Free all reserved elements in @pool and @pool itself. This function + * only sleeps if the free_fn() function sleeps. + */ +void mempool_destroy(mempool_t *pool) { while (pool->curr_nr) { void *element = remove_element(pool); @@ -36,6 +44,7 @@ static void free_pool(mempool_t *pool) kfree(pool->elements); kfree(pool); } +EXPORT_SYMBOL(mempool_destroy); /** * mempool_create - create a memory pool @@ -86,7 +95,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, element = pool->alloc(GFP_KERNEL, pool->pool_data); if (unlikely(!element)) { - free_pool(pool); + mempool_destroy(pool); return NULL; } add_element(pool, element); @@ -171,23 +180,6 @@ int mempool_resize(mempool_t *pool, int new_min_nr, gfp_t gfp_mask) } EXPORT_SYMBOL(mempool_resize); -/** - * mempool_destroy - deallocate a memory pool - * @pool: pointer to the memory pool which was allocated via - * mempool_create(). - * - * this function only sleeps if the free_fn() function sleeps. The caller - * has to guarantee that all elements have been returned to the pool (ie: - * freed) prior to calling mempool_destroy(). - */ -void mempool_destroy(mempool_t *pool) -{ - /* Check for outstanding elements */ - BUG_ON(pool->curr_nr != pool->min_nr); - free_pool(pool); -} -EXPORT_SYMBOL(mempool_destroy); - /** * mempool_alloc - allocate an element from a specific memory pool * @pool: pointer to the memory pool which was allocated via From 1ebb7044c9142c67d1d2b04d84010b4810a43fd8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:08:28 -0800 Subject: [PATCH 042/124] mempool: fix first round failure behavior mempool modifies gfp_mask so that the backing allocator doesn't try too hard or trigger warning message when there's pool to fall back on. In addition, for the first try, it removes __GFP_WAIT and IO, so that it doesn't trigger reclaim or wait when allocation can be fulfilled from pool; however, when that allocation fails and pool is empty too, it waits for the pool to be replenished before retrying. Allocation which could have succeeded after a bit of reclaim has to wait on the reserved items and it's not like mempool doesn't retry with __GFP_WAIT and IO. It just does that *after* someone returns an element, pointlessly delaying things. Fix it by retrying immediately if the first round of allocation attempts w/o __GFP_WAIT and IO fails. [akpm@linux-foundation.org: shorten the lock hold time] Signed-off-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/mempool.c b/mm/mempool.c index e3a802a0cea096..d9049811f3521b 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -221,14 +221,23 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) return element; } - /* We must not sleep in the GFP_ATOMIC case */ + /* + * We use gfp mask w/o __GFP_WAIT or IO for the first round. If + * alloc failed with that and @pool was empty, retry immediately. + */ + if (gfp_temp != gfp_mask) { + spin_unlock_irqrestore(&pool->lock, flags); + gfp_temp = gfp_mask; + goto repeat_alloc; + } + + /* We must not sleep if !__GFP_WAIT */ if (!(gfp_mask & __GFP_WAIT)) { spin_unlock_irqrestore(&pool->lock, flags); return NULL; } /* Let's wait for someone else to return an element to @pool */ - gfp_temp = gfp_mask; init_wait(&wait); prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE); From ea5768c74b8e0d6a866508fc6399d5ff958da5e3 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:30 -0800 Subject: [PATCH 043/124] mm/hugetlb.c: avoid bogus counter of surplus huge page If we have to hand back the newly allocated huge page to page allocator, for any reason, the changed counter should be recovered. This affects only s390 at present. Signed-off-by: Hillf Danton Reviewed-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bb7dc405634ff3..ea8c3a4cd2ae8a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -800,7 +800,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); - return NULL; + page = NULL; } spin_lock(&hugetlb_lock); From faed836a2371a96901057f310e436a09eded94fd Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Tue, 10 Jan 2012 15:08:32 -0800 Subject: [PATCH 044/124] mm/migrate.c: remove the unused macro lru_to_page lru_to_page is not used in mm/migrate.c. Signed-off-by: Wang Sheng-Hui Acked-by: Mel Gorman Acked-by: Kyungmin Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 670bb8911cd196..89ea0854332ec3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -39,8 +39,6 @@ #include "internal.h" -#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) - /* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is From 043bcbe5ec51e0478ef2b44acef17193e01d7f70 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 10 Jan 2012 15:08:33 -0800 Subject: [PATCH 045/124] mm: test PageSwapBacked in lumpy reclaim Lumpy reclaim does well to stop at a PageAnon when there's no swap, but better is to stop at any PageSwapBacked, which includes shmem/tmpfs too. Signed-off-by: Hugh Dickins Reviewed-by: KOSAKI Motohiro Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index b935e6f0d69518..8a4e767fb8ac15 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1166,7 +1166,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, * anon page which don't already have a swap slot is * pointless. */ - if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + if (nr_swap_pages <= 0 && PageSwapBacked(cursor_page) && !PageSwapCache(cursor_page)) break; From 3770490ec82ca63d5fdcebeb95f2f68af2626357 Mon Sep 17 00:00:00 2001 From: Hillf Danton Date: Tue, 10 Jan 2012 15:08:36 -0800 Subject: [PATCH 046/124] mm: vmscan: fix typo in isolating lru pages It is not the tag page but the cursor page that we should process, and it looks a typo. Signed-off-by: Hillf Danton Cc: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: David Rientjes Cc: Hugh Dickins Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8a4e767fb8ac15..26f4a8a4e0c75c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1173,7 +1173,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken += hpage_nr_pages(page); + nr_taken += hpage_nr_pages(cursor_page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; From db1aecafef58b5dda39c4228debe2c845e4a27ab Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2012 15:08:39 -0800 Subject: [PATCH 047/124] mm/vmalloc.c: change void* into explict vm_struct* vmap_area->private is void* but we don't use the field for various purpose but use only for vm_struct. So change it to a vm_struct* with naming to improve for readability and type checking. Signed-off-by: Minchan Kim Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 21fdf46ad5aac7..877ca046f43d02 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -256,7 +256,7 @@ struct vmap_area { struct rb_node rb_node; /* address sorted rbtree */ struct list_head list; /* address sorted list */ struct list_head purge_list; /* "lazy purge" list */ - void *private; + struct vm_struct *vm; struct rcu_head rcu_head; }; @@ -1285,7 +1285,7 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; - va->private = vm; + va->vm = vm; va->flags |= VM_VM_AREA; } @@ -1408,7 +1408,7 @@ static struct vm_struct *find_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) - return va->private; + return va->vm; return NULL; } @@ -1427,7 +1427,7 @@ struct vm_struct *remove_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { - struct vm_struct *vm = va->private; + struct vm_struct *vm = va->vm; if (!(vm->flags & VM_UNLIST)) { struct vm_struct *tmp, **p; From ed128fea3bcbce728c9c81b2e45ec3921911bfb6 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 10 Jan 2012 15:08:41 -0800 Subject: [PATCH 048/124] get_maintainers.pl: follow renames when looking up commit signers I happen to have had a commit to various network drivers since the big renaming/reorg which happened to drivers/net recently. This means that I now appear to be in the top few commit signers (by %age) for many of them so am getting sent all sorts of stuff and people who are involved with the driver are not. e.g. (to pick one at random): $ ./scripts/get_maintainer.pl -f drivers/net/ethernet/nvidia/forcedeth.c "David S. Miller" (commit_signer:5/7=71%) Ian Campbell (commit_signer:2/7=29%) Eric Dumazet (commit_signer:1/7=14%) Jeff Kirsher (commit_signer:1/7=14%) Jiri Pirko (commit_signer:1/7=14%) netdev@vger.kernel.org (open list:NETWORKING DRIVERS) linux-kernel@vger.kernel.org (open list) With the following patch the renames are followed and the result appears much more sensible: $ ./scripts/get_maintainer.pl -f drivers/net/ethernet/nvidia/forcedeth.c "David S. Miller" (commit_signer:31/34=91%) Joe Perches (commit_signer:11/34=32%) Szymon Janc (commit_signer:5/34=15%) Jiri Pirko (commit_signer:3/34=9%) Paul (commit_signer:2/34=6%) netdev@vger.kernel.org (open list:NETWORKING DRIVERS) linux-kernel@vger.kernel.org (open list) Signed-off-by: Ian Campbell Acked-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 4594f334105110..f32a04c4c5bc1c 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -95,7 +95,7 @@ "execute_cmd" => \&git_execute_cmd, "available" => '(which("git") ne "") && (-d ".git")', "find_signers_cmd" => - "git log --no-color --since=\$email_git_since " . + "git log --no-color --follow --since=\$email_git_since " . '--format="GitCommit: %H%n' . 'GitAuthor: %an <%ae>%n' . 'GitDate: %aD%n' . From 38f1b4c53826f3ac7e1b17c04a2dcdc802fb0785 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:42 -0800 Subject: [PATCH 049/124] MAINTAINERS: update various arm F: patterns Track renames and missing or deleted files. Signed-off-by: Joe Perches Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 0ae41c9a6c1382..200135616e5cc8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -914,7 +914,6 @@ M: Lennert Buytenhek M: Nicolas Pitre L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Odd Fixes -F: arch/arm/mach-loki/ F: arch/arm/mach-kirkwood/ F: arch/arm/mach-mv78xx0/ F: arch/arm/mach-orion5x/ @@ -1076,8 +1075,8 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-s5pv210/mach-aquila.c F: arch/arm/mach-s5pv210/mach-goni.c -F: arch/arm/mach-exynos4/mach-universal_c210.c -F: arch/arm/mach-exynos4/mach-nuri.c +F: arch/arm/mach-exynos/mach-universal_c210.c +F: arch/arm/mach-exynos/mach-nuri.c ARM/SAMSUNG S5P SERIES FIMC SUPPORT M: Kyungmin Park @@ -1105,7 +1104,6 @@ M: Tomasz Stanislawski L: linux-arm-kernel@lists.infradead.org L: linux-media@vger.kernel.org S: Maintained -F: arch/arm/plat-s5p/dev-tv.c F: drivers/media/video/s5p-tv/ ARM/SHMOBILE ARM ARCHITECTURE @@ -1140,7 +1138,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) W: http://www.mcuos.com S: Maintained F: arch/arm/mach-w90x900/ -F: arch/arm/mach-nuc93x/ F: drivers/input/keyboard/w90p910_keypad.c F: drivers/input/touchscreen/w90p910_ts.c F: drivers/watchdog/nuc900_wdt.c @@ -6180,9 +6177,7 @@ M: Viresh Kumar W: http://www.st.com/spear S: Maintained F: arch/arm/mach-spear*/clock.c -F: arch/arm/mach-spear*/include/mach/clkdev.h F: arch/arm/plat-spear/clock.c -F: arch/arm/plat-spear/include/plat/clkdev.h F: arch/arm/plat-spear/include/plat/clock.h SPEAR PAD MULTIPLEXING SUPPORT From 77278d50e04bfb57076eb50cf8c5f898f933bf84 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:44 -0800 Subject: [PATCH 050/124] MAINTAINERS: update adp gpio F: patterns Commit c103de240439df ("gpio: reorganize drivers") renamed the files, update the patterns. Signed-off-by: Joe Perches Acked-by: Grant Likely Acked-by: Michael Hennerich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 200135616e5cc8..c506575640ea03 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -342,7 +342,7 @@ S: Supported F: drivers/mfd/adp5520.c F: drivers/video/backlight/adp5520_bl.c F: drivers/leds/leds-adp5520.c -F: drivers/gpio/adp5520-gpio.c +F: drivers/gpio/gpio-adp5520.c F: drivers/input/keyboard/adp5520-keys.c ADP5588 QWERTY KEYPAD AND IO EXPANDER DRIVER (ADP5588/ADP5587) @@ -351,7 +351,7 @@ L: device-drivers-devel@blackfin.uclinux.org W: http://wiki.analog.com/ADP5588 S: Supported F: drivers/input/keyboard/adp5588-keys.c -F: drivers/gpio/adp5588-gpio.c +F: drivers/gpio/gpio-adp5588.c ADP8860 BACKLIGHT DRIVER (ADP8860/ADP8861/ADP8863) M: Michael Hennerich From 72dbb7051334c37c9210cd735684c304da8a5e85 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:46 -0800 Subject: [PATCH 051/124] MAINTAINERS: update bt8xx gpio F: patterns Commit c103de240439d ("gpio: reorganize drivers") renamed the file, update the pattern. Signed-off-by: Joe Perches Cc: Grant Likely Cc: Michael Buesch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c506575640ea03..04716db6078fd7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1618,7 +1618,7 @@ BT8XXGPIO DRIVER M: Michael Buesch W: http://bu3sch.de/btgpio.php S: Maintained -F: drivers/gpio/bt8xxgpio.c +F: drivers/gpio/gpio-bt8xx.c BTRFS FILE SYSTEM M: Chris Mason From 25b8d2b4fc4fd9f9ae7f95ce76bc47712c99809e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:49 -0800 Subject: [PATCH 052/124] MAINTAINERS: update marvell ccic F: patterns Commit f8fc729870ee ("[media] marvell-cam: Move cafe-ccic into its own directory") moved the files, update the pattern. Signed-off-by: Joe Perches Cc: Jonathan Corbet Acked-by: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 04716db6078fd7..75da19d890f605 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1659,7 +1659,7 @@ L: linux-media@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-2.6.git S: Maintained F: Documentation/video4linux/cafe_ccic -F: drivers/media/video/cafe_ccic* +F: drivers/media/video/marvell-ccic/ CAIF NETWORK LAYER M: Sjur Braendeland From d8f663561b185101c5b97b55d0f6aad49671d4e3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:51 -0800 Subject: [PATCH 053/124] MAINTAINERS: update mfd F: patterns commit 8959e74399c ("mfd: Delete ab3550 driver") removed the driver, update the patterns. Signed-off-by: Joe Perches Acked-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 - 1 file changed, 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 75da19d890f605..1d20e3f9dc00b5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1169,7 +1169,6 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers) S: Maintained F: arch/arm/mach-ux500/ F: drivers/dma/ste_dma40* -F: drivers/mfd/ab3550* F: drivers/mfd/abx500* F: drivers/mfd/ab8500* F: drivers/mfd/stmpe* From d4a45787afd22316dc9ee9129a58796100621cb5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:54 -0800 Subject: [PATCH 054/124] MAINTAINERS: update sdhci F: patterns commit 38576af1f8c ("mmc: sdhci: make sdhci-of device drivers self registered") moved the files around. Update the patterns. Signed-off-by: Joe Perches Cc: Shawn Guo Cc: Chris Ball Acked-by: Anton Vorontsov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 1d20e3f9dc00b5..f72a3efd8dbb54 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5791,13 +5791,14 @@ L: linux-mmc@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/cjb/mmc.git S: Maintained F: drivers/mmc/host/sdhci.* +F: drivers/mmc/host/sdhci-pltfm.[ch] SECURE DIGITAL HOST CONTROLLER INTERFACE, OPEN FIRMWARE BINDINGS (SDHCI-OF) M: Anton Vorontsov L: linuxppc-dev@lists.ozlabs.org L: linux-mmc@vger.kernel.org S: Maintained -F: drivers/mmc/host/sdhci-of.* +F: drivers/mmc/host/sdhci-pltfm.[ch] SECURE DIGITAL HOST CONTROLLER INTERFACE (SDHCI) SAMSUNG DRIVER M: Ben Dooks From 0f04e2aa0cbe10f06326cd7f98aaf0012d9c6038 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:56 -0800 Subject: [PATCH 055/124] MAINTAINERS: update tulip F: patterns commit a88394cfb58 ("ewrk3/tulip: Move the DEC - Tulip drivers") moved the files, update the patterns. Signed-off-by: Joe Perches Acked-by: Grant Grundler Cc: Jeff Kirsher Cc: Tobias Ringstrom Cc: Grant Grundler Cc: David Davies Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index f72a3efd8dbb54..f952e003eb9c6a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2096,7 +2096,7 @@ DAVICOM FAST ETHERNET (DMFE) NETWORK DRIVER L: netdev@vger.kernel.org S: Orphan F: Documentation/networking/dmfe.txt -F: drivers/net/ethernet/tulip/dmfe.c +F: drivers/net/ethernet/dec/tulip/dmfe.c DC390/AM53C974 SCSI driver M: Kurt Garloff @@ -6640,7 +6640,7 @@ TULIP NETWORK DRIVERS M: Grant Grundler L: netdev@vger.kernel.org S: Maintained -F: drivers/net/ethernet/tulip/ +F: drivers/net/ethernet/dec/tulip/ TUN/TAP driver M: Maxim Krasnyansky From a31a96ad7206df554f1d1571b986abbe742d8b8e Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:08:58 -0800 Subject: [PATCH 056/124] MAINTAINERS: update greth F: patterns commit 1fe003fd424 ("greth: Move the Aeroflex Gaisler driver") moved the files, update the patterns. Signed-off-by: Joe Perches Cc: Kristoffer Glembo Cc: Jeff Kirsher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index f952e003eb9c6a..abb632e25f13c0 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2906,7 +2906,7 @@ GRETH 10/100/1G Ethernet MAC device driver M: Kristoffer Glembo L: netdev@vger.kernel.org S: Maintained -F: drivers/net/greth* +F: drivers/net/ethernet/aeroflex/ GSPCA FINEPIX SUBDRIVER M: Frank Zago From 19c90aa678a166381609af574d2a993568f5f5bb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:00 -0800 Subject: [PATCH 057/124] MAINTAINERS: update encrypted-keys F: patterns commit 61cf45d0199 ("encrypted-keys: create encrypted-keys directory") moved the files, update the patterns. Signed-off-by: Joe Perches Cc: Mimi Zohar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index abb632e25f13c0..dbf1676fe3c6dc 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3856,8 +3856,7 @@ L: keyrings@linux-nfs.org S: Supported F: Documentation/security/keys-trusted-encrypted.txt F: include/keys/encrypted-type.h -F: security/keys/encrypted.c -F: security/keys/encrypted.h +F: security/keys/encrypted-keys/ KGDB / KDB /debug_core M: Jason Wessel From b2b0186d0f98e7fc2c21a2a3514223ab248e46f6 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:01 -0800 Subject: [PATCH 058/124] MAINTAINERS: staging: media: update F: patterns commit 4860c73804c ("staging: Move media drivers to staging/media") moved the files, update the F: patterns. Signed-off-by: Joe Perches Acked-by: Mauro Carvalho Chehab Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dbf1676fe3c6dc..43083178cee875 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -6300,7 +6300,7 @@ STAGING - LIRC (LINUX INFRARED REMOTE CONTROL) DRIVERS M: Jarod Wilson W: http://www.lirc.org/ S: Odd Fixes -F: drivers/staging/lirc/ +F: drivers/staging/media/lirc/ STAGING - NVIDIA COMPLIANT EMBEDDED CONTROLLER INTERFACE (nvec) M: Julian Andres Klode @@ -6336,7 +6336,7 @@ F: drivers/staging/sm7xx/ STAGING - SOFTLOGIC 6x10 MPEG CODEC M: Ben Collins S: Odd Fixes -F: drivers/staging/solo6x10/ +F: drivers/staging/media/solo6x10/ STAGING - SPEAKUP CONSOLE SPEECH DRIVER M: William Hubbs From 8460241e4477db699135ad0521e1293258a8baaa Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:04 -0800 Subject: [PATCH 059/124] MAINTAINERS: serial:blackfin: update F: pattern commit 0c6967b5a0 ("serial:blackfin: rename Blackfin serial driver to bfin_uart.c") renamed the file, update the pattern. Signed-off-by: Joe Perches Acked-by: Sonic Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 43083178cee875..7510b633922239 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1487,7 +1487,7 @@ M: Sonic Zhang L: uclinux-dist-devel@blackfin.uclinux.org W: http://blackfin.uclinux.org S: Supported -F: drivers/tty/serial/bfin_5xx.c +F: drivers/tty/serial/bfin_uart.c BLACKFIN WATCHDOG DRIVER M: Mike Frysinger From 9df92e6c770e2709f6e3080a4a82d71953267f05 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:06 -0800 Subject: [PATCH 060/124] MAINTAINERS: spi: update F: patterns commit ca632f55669 ("spi: reorganize drivers") renamed the files, update the F: patterns. Signed-off-by: Joe Perches Acked-by: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7510b633922239..e4eeb9b856a22a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1144,7 +1144,7 @@ F: drivers/watchdog/nuc900_wdt.c F: drivers/net/ethernet/nuvoton/w90p910_ether.c F: drivers/mtd/nand/nuc900_nand.c F: drivers/rtc/rtc-nuc900.c -F: drivers/spi/spi_nuc900.c +F: drivers/spi/spi-nuc900.c F: drivers/usb/host/ehci-w90x900.c F: drivers/video/nuc900fb.c @@ -1348,7 +1348,7 @@ F: drivers/net/ethernet/cadence/ ATMEL SPI DRIVER M: Nicolas Ferre S: Supported -F: drivers/spi/atmel_spi.* +F: drivers/spi/spi-atmel.* ATMEL USBA UDC DRIVER M: Nicolas Ferre @@ -5308,7 +5308,7 @@ T: git git://git.linaro.org/people/ycmiao/pxa-linux.git S: Maintained F: arch/arm/mach-pxa/ F: drivers/pcmcia/pxa2xx* -F: drivers/spi/pxa2xx* +F: drivers/spi/spi-pxa2xx* F: drivers/usb/gadget/pxa2* F: include/sound/pxa2xx-lib.h F: sound/arm/pxa* From 89d07767d051c9713b4d79c387c1eadd085c30f8 Mon Sep 17 00:00:00 2001 From: Kyungmin Park Date: Tue, 10 Jan 2012 15:09:09 -0800 Subject: [PATCH 061/124] devfreq: add devfreq maintainer entry As devfreq is merged at mainline. Also update the maintainer entry. Signed-off-by: Kyungmin Park Cc: Kevin Hilman Cc: MyungJoo Ham Acked-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index e4eeb9b856a22a..cf6b2d8ff2ab1f 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2169,6 +2169,13 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/balbi/usb.git S: Maintained F: drivers/usb/dwc3/ +DEVICE FREQUENCY (DEVFREQ) +M: MyungJoo Ham +M: Kyungmin Park +L: linux-kernel@vger.kernel.org +S: Maintained +F: drivers/devfreq/ + DEVICE NUMBER REGISTRY M: Torben Mathiasen W: http://lanana.org/docs/device-list/index.html From 3ed0c15fd1032c6a75aba804a200d4acc5aeb72e Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 10 Jan 2012 15:09:10 -0800 Subject: [PATCH 062/124] backlight: remove ADX backlight device support Support for the Avionic Design Xanthos backlight device got added in commit 3b96ea9ef8 ("backlight: Add support for the Avionic Design Xanthos backlight device."). That support depends on ARCH_PXA_ADX. The code that should have provided that Kconfig symbol never got submitted. It has never been possible to even build this driver. Remove it. Signed-off-by: Paul Bolle Acked-by: Thierry Reding Cc: Richard Purdie Cc: Wim Van Sebroeck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/Kconfig | 8 -- drivers/video/backlight/Makefile | 1 - drivers/video/backlight/adx_bl.c | 182 ------------------------------- 3 files changed, 191 deletions(-) delete mode 100644 drivers/video/backlight/adx_bl.c diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 278aeaa9250596..681b36929fe406 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -280,14 +280,6 @@ config BACKLIGHT_WM831X If you have a backlight driven by the ISINK and DCDC of a WM831x PMIC say y to enable the backlight driver for it. -config BACKLIGHT_ADX - tristate "Avionic Design Xanthos Backlight Driver" - depends on ARCH_PXA_ADX - default y - help - Say Y to enable the backlight driver on Avionic Design Xanthos-based - boards. - config BACKLIGHT_ADP5520 tristate "Backlight Driver for ADP5520/ADP5501 using WLED" depends on PMIC_ADP5520 diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index fdd1fc4b277062..af5cf654ec7c5c 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -32,7 +32,6 @@ obj-$(CONFIG_BACKLIGHT_APPLE) += apple_bl.o obj-$(CONFIG_BACKLIGHT_TOSA) += tosa_bl.o obj-$(CONFIG_BACKLIGHT_SAHARA) += kb3886_bl.o obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o -obj-$(CONFIG_BACKLIGHT_ADX) += adx_bl.o obj-$(CONFIG_BACKLIGHT_ADP5520) += adp5520_bl.o obj-$(CONFIG_BACKLIGHT_ADP8860) += adp8860_bl.o obj-$(CONFIG_BACKLIGHT_ADP8870) += adp8870_bl.o diff --git a/drivers/video/backlight/adx_bl.c b/drivers/video/backlight/adx_bl.c deleted file mode 100644 index c861c41af44233..00000000000000 --- a/drivers/video/backlight/adx_bl.c +++ /dev/null @@ -1,182 +0,0 @@ -/* - * linux/drivers/video/backlight/adx.c - * - * Copyright (C) 2009 Avionic Design GmbH - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - * Written by Thierry Reding - */ - -#include -#include -#include -#include -#include -#include - -/* register definitions */ -#define ADX_BACKLIGHT_CONTROL 0x00 -#define ADX_BACKLIGHT_CONTROL_ENABLE (1 << 0) -#define ADX_BACKLIGHT_BRIGHTNESS 0x08 -#define ADX_BACKLIGHT_STATUS 0x10 -#define ADX_BACKLIGHT_ERROR 0x18 - -struct adxbl { - void __iomem *base; -}; - -static int adx_backlight_update_status(struct backlight_device *bldev) -{ - struct adxbl *bl = bl_get_data(bldev); - u32 value; - - value = bldev->props.brightness; - writel(value, bl->base + ADX_BACKLIGHT_BRIGHTNESS); - - value = readl(bl->base + ADX_BACKLIGHT_CONTROL); - - if (bldev->props.state & BL_CORE_FBBLANK) - value &= ~ADX_BACKLIGHT_CONTROL_ENABLE; - else - value |= ADX_BACKLIGHT_CONTROL_ENABLE; - - writel(value, bl->base + ADX_BACKLIGHT_CONTROL); - - return 0; -} - -static int adx_backlight_get_brightness(struct backlight_device *bldev) -{ - struct adxbl *bl = bl_get_data(bldev); - u32 brightness; - - brightness = readl(bl->base + ADX_BACKLIGHT_BRIGHTNESS); - return brightness & 0xff; -} - -static int adx_backlight_check_fb(struct backlight_device *bldev, struct fb_info *fb) -{ - return 1; -} - -static const struct backlight_ops adx_backlight_ops = { - .options = 0, - .update_status = adx_backlight_update_status, - .get_brightness = adx_backlight_get_brightness, - .check_fb = adx_backlight_check_fb, -}; - -static int __devinit adx_backlight_probe(struct platform_device *pdev) -{ - struct backlight_properties props; - struct backlight_device *bldev; - struct resource *res; - struct adxbl *bl; - int ret = 0; - - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - if (!res) { - ret = -ENXIO; - goto out; - } - - res = devm_request_mem_region(&pdev->dev, res->start, - resource_size(res), res->name); - if (!res) { - ret = -ENXIO; - goto out; - } - - bl = devm_kzalloc(&pdev->dev, sizeof(*bl), GFP_KERNEL); - if (!bl) { - ret = -ENOMEM; - goto out; - } - - bl->base = devm_ioremap_nocache(&pdev->dev, res->start, - resource_size(res)); - if (!bl->base) { - ret = -ENXIO; - goto out; - } - - memset(&props, 0, sizeof(struct backlight_properties)); - props.type = BACKLIGHT_RAW; - props.max_brightness = 0xff; - bldev = backlight_device_register(dev_name(&pdev->dev), &pdev->dev, - bl, &adx_backlight_ops, &props); - if (IS_ERR(bldev)) { - ret = PTR_ERR(bldev); - goto out; - } - - bldev->props.brightness = 0xff; - bldev->props.power = FB_BLANK_UNBLANK; - - platform_set_drvdata(pdev, bldev); - -out: - return ret; -} - -static int __devexit adx_backlight_remove(struct platform_device *pdev) -{ - struct backlight_device *bldev; - int ret = 0; - - bldev = platform_get_drvdata(pdev); - bldev->props.power = FB_BLANK_UNBLANK; - bldev->props.brightness = 0xff; - backlight_update_status(bldev); - backlight_device_unregister(bldev); - platform_set_drvdata(pdev, NULL); - - return ret; -} - -#ifdef CONFIG_PM -static int adx_backlight_suspend(struct platform_device *pdev, - pm_message_t state) -{ - return 0; -} - -static int adx_backlight_resume(struct platform_device *pdev) -{ - return 0; -} -#else -#define adx_backlight_suspend NULL -#define adx_backlight_resume NULL -#endif - -static struct platform_driver adx_backlight_driver = { - .probe = adx_backlight_probe, - .remove = __devexit_p(adx_backlight_remove), - .suspend = adx_backlight_suspend, - .resume = adx_backlight_resume, - .driver = { - .name = "adx-backlight", - .owner = THIS_MODULE, - }, -}; - -static int __init adx_backlight_init(void) -{ - return platform_driver_register(&adx_backlight_driver); -} - -static void __exit adx_backlight_exit(void) -{ - platform_driver_unregister(&adx_backlight_driver); -} - -module_init(adx_backlight_init); -module_exit(adx_backlight_exit); - -MODULE_AUTHOR("Thierry Reding "); -MODULE_DESCRIPTION("Avionic Design Xanthos Backlight Driver"); -MODULE_LICENSE("GPL v2"); From 81178e021689bf86c328f144aa0f0e1b50f5e94c Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:11 -0800 Subject: [PATCH 063/124] backlight: convert drivers/video/backlight/* to use module_platform_driver() Convert the drivers in drivers/video/backlight/* to use the module_platform_driver() macro which makes the code smaller and a bit simpler. Signed-off-by: Axel Lin Acked-by: Haojian Zhuang Acked-by: H Hartley Sweeten [ep93xx_bl.c] Cc: Mike Rapoport Cc: Richard Purdie Acked-by: Michael Hennerich Acked-by: Mark Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/88pm860x_bl.c | 12 +----------- drivers/video/backlight/adp5520_bl.c | 12 +----------- drivers/video/backlight/da903x_bl.c | 12 +----------- drivers/video/backlight/ep93xx_bl.c | 12 +----------- drivers/video/backlight/generic_bl.c | 13 +------------ drivers/video/backlight/jornada720_bl.c | 13 +------------ drivers/video/backlight/jornada720_lcd.c | 13 +------------ drivers/video/backlight/max8925_bl.c | 12 +----------- drivers/video/backlight/omap1_bl.c | 13 +------------ drivers/video/backlight/pcf50633-backlight.c | 12 +----------- drivers/video/backlight/platform_lcd.c | 13 +------------ drivers/video/backlight/pwm_bl.c | 12 +----------- drivers/video/backlight/wm831x_bl.c | 12 +----------- 13 files changed, 13 insertions(+), 148 deletions(-) diff --git a/drivers/video/backlight/88pm860x_bl.c b/drivers/video/backlight/88pm860x_bl.c index 1105fa1ed7f4a4..a1376dc73d71cd 100644 --- a/drivers/video/backlight/88pm860x_bl.c +++ b/drivers/video/backlight/88pm860x_bl.c @@ -270,17 +270,7 @@ static struct platform_driver pm860x_backlight_driver = { .remove = pm860x_backlight_remove, }; -static int __init pm860x_backlight_init(void) -{ - return platform_driver_register(&pm860x_backlight_driver); -} -module_init(pm860x_backlight_init); - -static void __exit pm860x_backlight_exit(void) -{ - platform_driver_unregister(&pm860x_backlight_driver); -} -module_exit(pm860x_backlight_exit); +module_platform_driver(pm860x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Marvell Semiconductor 88PM8606"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/video/backlight/adp5520_bl.c b/drivers/video/backlight/adp5520_bl.c index dfb763e9147ff9..2e630bf1164cab 100644 --- a/drivers/video/backlight/adp5520_bl.c +++ b/drivers/video/backlight/adp5520_bl.c @@ -384,17 +384,7 @@ static struct platform_driver adp5520_bl_driver = { .resume = adp5520_bl_resume, }; -static int __init adp5520_bl_init(void) -{ - return platform_driver_register(&adp5520_bl_driver); -} -module_init(adp5520_bl_init); - -static void __exit adp5520_bl_exit(void) -{ - platform_driver_unregister(&adp5520_bl_driver); -} -module_exit(adp5520_bl_exit); +module_platform_driver(adp5520_bl_driver); MODULE_AUTHOR("Michael Hennerich "); MODULE_DESCRIPTION("ADP5520(01) Backlight Driver"); diff --git a/drivers/video/backlight/da903x_bl.c b/drivers/video/backlight/da903x_bl.c index d68f14bbb687d8..abb4a06268f183 100644 --- a/drivers/video/backlight/da903x_bl.c +++ b/drivers/video/backlight/da903x_bl.c @@ -199,17 +199,7 @@ static struct platform_driver da903x_backlight_driver = { .remove = da903x_backlight_remove, }; -static int __init da903x_backlight_init(void) -{ - return platform_driver_register(&da903x_backlight_driver); -} -module_init(da903x_backlight_init); - -static void __exit da903x_backlight_exit(void) -{ - platform_driver_unregister(&da903x_backlight_driver); -} -module_exit(da903x_backlight_exit); +module_platform_driver(da903x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Dialog Semiconductor DA9030/DA9034"); MODULE_AUTHOR("Eric Miao " diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index c74a6f4baa127a..32b91677cd57ee 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -144,17 +144,7 @@ static struct platform_driver ep93xxbl_driver = { .resume = ep93xxbl_resume, }; -static int __init ep93xxbl_init(void) -{ - return platform_driver_register(&ep93xxbl_driver); -} -module_init(ep93xxbl_init); - -static void __exit ep93xxbl_exit(void) -{ - platform_driver_unregister(&ep93xxbl_driver); -} -module_exit(ep93xxbl_exit); +module_platform_driver(ep93xxbl_driver); MODULE_DESCRIPTION("EP93xx Backlight Driver"); MODULE_AUTHOR("H Hartley Sweeten "); diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c index adb191466d6463..9ce6170c186079 100644 --- a/drivers/video/backlight/generic_bl.c +++ b/drivers/video/backlight/generic_bl.c @@ -132,18 +132,7 @@ static struct platform_driver genericbl_driver = { }, }; -static int __init genericbl_init(void) -{ - return platform_driver_register(&genericbl_driver); -} - -static void __exit genericbl_exit(void) -{ - platform_driver_unregister(&genericbl_driver); -} - -module_init(genericbl_init); -module_exit(genericbl_exit); +module_platform_driver(genericbl_driver); MODULE_AUTHOR("Richard Purdie "); MODULE_DESCRIPTION("Generic Backlight Driver"); diff --git a/drivers/video/backlight/jornada720_bl.c b/drivers/video/backlight/jornada720_bl.c index de65d80159beed..2f8af5d786abbb 100644 --- a/drivers/video/backlight/jornada720_bl.c +++ b/drivers/video/backlight/jornada720_bl.c @@ -147,19 +147,8 @@ static struct platform_driver jornada_bl_driver = { }, }; -static int __init jornada_bl_init(void) -{ - return platform_driver_register(&jornada_bl_driver); -} - -static void __exit jornada_bl_exit(void) -{ - platform_driver_unregister(&jornada_bl_driver); -} +module_platform_driver(jornada_bl_driver); MODULE_AUTHOR("Kristoffer Ericson "); MODULE_DESCRIPTION("HP Jornada 710/720/728 Backlight driver"); MODULE_LICENSE("GPL"); - -module_init(jornada_bl_init); -module_exit(jornada_bl_exit); diff --git a/drivers/video/backlight/jornada720_lcd.c b/drivers/video/backlight/jornada720_lcd.c index d2ff658b4144f9..22d231a17e3c46 100644 --- a/drivers/video/backlight/jornada720_lcd.c +++ b/drivers/video/backlight/jornada720_lcd.c @@ -135,19 +135,8 @@ static struct platform_driver jornada_lcd_driver = { }, }; -static int __init jornada_lcd_init(void) -{ - return platform_driver_register(&jornada_lcd_driver); -} - -static void __exit jornada_lcd_exit(void) -{ - platform_driver_unregister(&jornada_lcd_driver); -} +module_platform_driver(jornada_lcd_driver); MODULE_AUTHOR("Kristoffer Ericson "); MODULE_DESCRIPTION("HP Jornada 710/720/728 LCD driver"); MODULE_LICENSE("GPL"); - -module_init(jornada_lcd_init); -module_exit(jornada_lcd_exit); diff --git a/drivers/video/backlight/max8925_bl.c b/drivers/video/backlight/max8925_bl.c index 7bbc802560ea04..c915e3b5388698 100644 --- a/drivers/video/backlight/max8925_bl.c +++ b/drivers/video/backlight/max8925_bl.c @@ -188,17 +188,7 @@ static struct platform_driver max8925_backlight_driver = { .remove = __devexit_p(max8925_backlight_remove), }; -static int __init max8925_backlight_init(void) -{ - return platform_driver_register(&max8925_backlight_driver); -} -module_init(max8925_backlight_init); - -static void __exit max8925_backlight_exit(void) -{ - platform_driver_unregister(&max8925_backlight_driver); -}; -module_exit(max8925_backlight_exit); +module_platform_driver(max8925_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for Maxim MAX8925"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/video/backlight/omap1_bl.c b/drivers/video/backlight/omap1_bl.c index 08d26a72394c8e..d8cde277ec83a6 100644 --- a/drivers/video/backlight/omap1_bl.c +++ b/drivers/video/backlight/omap1_bl.c @@ -195,18 +195,7 @@ static struct platform_driver omapbl_driver = { }, }; -static int __init omapbl_init(void) -{ - return platform_driver_register(&omapbl_driver); -} - -static void __exit omapbl_exit(void) -{ - platform_driver_unregister(&omapbl_driver); -} - -module_init(omapbl_init); -module_exit(omapbl_exit); +module_platform_driver(omapbl_driver); MODULE_AUTHOR("Andrzej Zaborowski "); MODULE_DESCRIPTION("OMAP LCD Backlight driver"); diff --git a/drivers/video/backlight/pcf50633-backlight.c b/drivers/video/backlight/pcf50633-backlight.c index ef5628d6056384..13e88b71daecc4 100644 --- a/drivers/video/backlight/pcf50633-backlight.c +++ b/drivers/video/backlight/pcf50633-backlight.c @@ -173,17 +173,7 @@ static struct platform_driver pcf50633_bl_driver = { }, }; -static int __init pcf50633_bl_init(void) -{ - return platform_driver_register(&pcf50633_bl_driver); -} -module_init(pcf50633_bl_init); - -static void __exit pcf50633_bl_exit(void) -{ - platform_driver_unregister(&pcf50633_bl_driver); -} -module_exit(pcf50633_bl_exit); +module_platform_driver(pcf50633_bl_driver); MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_DESCRIPTION("PCF50633 backlight driver"); diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index 302330acf6284e..187da59e3a13f6 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -157,18 +157,7 @@ static struct platform_driver platform_lcd_driver = { .resume = platform_lcd_resume, }; -static int __init platform_lcd_init(void) -{ - return platform_driver_register(&platform_lcd_driver); -} - -static void __exit platform_lcd_cleanup(void) -{ - platform_driver_unregister(&platform_lcd_driver); -} - -module_init(platform_lcd_init); -module_exit(platform_lcd_cleanup); +module_platform_driver(platform_lcd_driver); MODULE_AUTHOR("Ben Dooks "); MODULE_LICENSE("GPL v2"); diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index 8b5b2a4124c798..b811e8fb406273 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -207,17 +207,7 @@ static struct platform_driver pwm_backlight_driver = { .resume = pwm_backlight_resume, }; -static int __init pwm_backlight_init(void) -{ - return platform_driver_register(&pwm_backlight_driver); -} -module_init(pwm_backlight_init); - -static void __exit pwm_backlight_exit(void) -{ - platform_driver_unregister(&pwm_backlight_driver); -} -module_exit(pwm_backlight_exit); +module_platform_driver(pwm_backlight_driver); MODULE_DESCRIPTION("PWM based Backlight Driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/wm831x_bl.c b/drivers/video/backlight/wm831x_bl.c index fbe9e9316f3b51..4e915f5eca99b4 100644 --- a/drivers/video/backlight/wm831x_bl.c +++ b/drivers/video/backlight/wm831x_bl.c @@ -236,17 +236,7 @@ static struct platform_driver wm831x_backlight_driver = { .remove = wm831x_backlight_remove, }; -static int __init wm831x_backlight_init(void) -{ - return platform_driver_register(&wm831x_backlight_driver); -} -module_init(wm831x_backlight_init); - -static void __exit wm831x_backlight_exit(void) -{ - platform_driver_unregister(&wm831x_backlight_driver); -} -module_exit(wm831x_backlight_exit); +module_platform_driver(wm831x_backlight_driver); MODULE_DESCRIPTION("Backlight Driver for WM831x PMICs"); MODULE_AUTHOR("Mark Brown Date: Tue, 10 Jan 2012 15:09:15 -0800 Subject: [PATCH 064/124] backlight/ld9040.c: regulator control in the driver This patch supports regulator power control in the driver. Current ld9040 driver was controlled power on/off sequence by callback function in the board file. But, by doing this, there's no need to register lcd power on/off callback function in the board file. Signed-off-by: Donghwa Lee Signed-off-by: Kyungmin Park Signed-off-by: Inki Dae Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/ld9040.c | 71 ++++++++++++++++++++++++++------ 1 file changed, 59 insertions(+), 12 deletions(-) diff --git a/drivers/video/backlight/ld9040.c b/drivers/video/backlight/ld9040.c index da9a5ce0ccb846..78dafc0c8fc5a1 100644 --- a/drivers/video/backlight/ld9040.c +++ b/drivers/video/backlight/ld9040.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "ld9040_gamma.h" @@ -53,8 +54,51 @@ struct ld9040 { struct lcd_device *ld; struct backlight_device *bd; struct lcd_platform_data *lcd_pd; + + struct mutex lock; + bool enabled; +}; + +static struct regulator_bulk_data supplies[] = { + { .supply = "vdd3", }, + { .supply = "vci", }, }; +static void ld9040_regulator_enable(struct ld9040 *lcd) +{ + int ret = 0; + struct lcd_platform_data *pd = NULL; + + pd = lcd->lcd_pd; + mutex_lock(&lcd->lock); + if (!lcd->enabled) { + ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies); + if (ret) + goto out; + + lcd->enabled = true; + } + mdelay(pd->power_on_delay); +out: + mutex_unlock(&lcd->lock); +} + +static void ld9040_regulator_disable(struct ld9040 *lcd) +{ + int ret = 0; + + mutex_lock(&lcd->lock); + if (lcd->enabled) { + ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies); + if (ret) + goto out; + + lcd->enabled = false; + } +out: + mutex_unlock(&lcd->lock); +} + static const unsigned short seq_swreset[] = { 0x01, COMMAND_ONLY, ENDDEF, 0x00 @@ -532,13 +576,8 @@ static int ld9040_power_on(struct ld9040 *lcd) return -EFAULT; } - if (!pd->power_on) { - dev_err(lcd->dev, "power_on is NULL.\n"); - return -EFAULT; - } else { - pd->power_on(lcd->ld, 1); - mdelay(pd->power_on_delay); - } + /* lcd power on */ + ld9040_regulator_enable(lcd); if (!pd->reset) { dev_err(lcd->dev, "reset is NULL.\n"); @@ -582,11 +621,8 @@ static int ld9040_power_off(struct ld9040 *lcd) mdelay(pd->power_off_delay); - if (!pd->power_on) { - dev_err(lcd->dev, "power_on is NULL.\n"); - return -EFAULT; - } else - pd->power_on(lcd->ld, 0); + /* lcd power off */ + ld9040_regulator_disable(lcd); return 0; } @@ -693,6 +729,14 @@ static int ld9040_probe(struct spi_device *spi) goto out_free_lcd; } + mutex_init(&lcd->lock); + + ret = regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies); + if (ret) { + dev_err(lcd->dev, "Failed to get regulators: %d\n", ret); + goto out_free_lcd; + } + ld = lcd_device_register("ld9040", &spi->dev, lcd, &ld9040_lcd_ops); if (IS_ERR(ld)) { ret = PTR_ERR(ld); @@ -739,6 +783,8 @@ static int ld9040_probe(struct spi_device *spi) out_unregister_lcd: lcd_device_unregister(lcd->ld); out_free_lcd: + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); + kfree(lcd); return ret; } @@ -750,6 +796,7 @@ static int __devexit ld9040_remove(struct spi_device *spi) ld9040_power(lcd, FB_BLANK_POWERDOWN); backlight_device_unregister(lcd->bd); lcd_device_unregister(lcd->ld); + regulator_bulk_free(ARRAY_SIZE(supplies), supplies); kfree(lcd); return 0; From 1cfc6fee34a4343d79357c46722eb840fbc04f46 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 10 Jan 2012 15:09:18 -0800 Subject: [PATCH 065/124] drivers/video/backlight/ep93xx_bl.c: remove duplicated header include module.h is included twice. Signed-off-by: Jingoo Han Acked-by: H Hartley Sweeten Cc: Ryan Mallon Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/ep93xx_bl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/video/backlight/ep93xx_bl.c b/drivers/video/backlight/ep93xx_bl.c index 32b91677cd57ee..b62b8b9063b53b 100644 --- a/drivers/video/backlight/ep93xx_bl.c +++ b/drivers/video/backlight/ep93xx_bl.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include From 66655760bf38861299e3c8196f5303f886b0eef9 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Tue, 10 Jan 2012 15:09:19 -0800 Subject: [PATCH 066/124] backlight: use kstrtoul() The usage of simple_strtoul() or strict_strtoul() is not preferred. Thus, kstrtoul should be used. This patch also fixes checkpatch error as follows: ERROR: space required after that ',' (ctx:VxV) Signed-off-by: Jingoo Han Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/backlight.c | 6 +++--- drivers/video/backlight/lcd.c | 26 ++++++++++---------------- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 7363c1b169e8f5..bf5b1ece71605d 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -102,7 +102,7 @@ static void backlight_generate_event(struct backlight_device *bd, } static ssize_t backlight_show_power(struct device *dev, - struct device_attribute *attr,char *buf) + struct device_attribute *attr, char *buf) { struct backlight_device *bd = to_backlight_device(dev); @@ -116,7 +116,7 @@ static ssize_t backlight_store_power(struct device *dev, struct backlight_device *bd = to_backlight_device(dev); unsigned long power; - rc = strict_strtoul(buf, 0, &power); + rc = kstrtoul(buf, 0, &power); if (rc) return rc; @@ -150,7 +150,7 @@ static ssize_t backlight_store_brightness(struct device *dev, struct backlight_device *bd = to_backlight_device(dev); unsigned long brightness; - rc = strict_strtoul(buf, 0, &brightness); + rc = kstrtoul(buf, 0, &brightness); if (rc) return rc; diff --git a/drivers/video/backlight/lcd.c b/drivers/video/backlight/lcd.c index 71a11cadffc481..79c1b0d609a809 100644 --- a/drivers/video/backlight/lcd.c +++ b/drivers/video/backlight/lcd.c @@ -97,19 +97,16 @@ static ssize_t lcd_store_power(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc = -ENXIO; - char *endp; struct lcd_device *ld = to_lcd_device(dev); - int power = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long power; - if (isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = kstrtoul(buf, 0, &power); + if (rc) + return rc; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_power) { - pr_debug("lcd: set power to %d\n", power); + pr_debug("lcd: set power to %lu\n", power); ld->ops->set_power(ld, power); rc = count; } @@ -136,19 +133,16 @@ static ssize_t lcd_store_contrast(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { int rc = -ENXIO; - char *endp; struct lcd_device *ld = to_lcd_device(dev); - int contrast = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long contrast; - if (isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = kstrtoul(buf, 0, &contrast); + if (rc) + return rc; mutex_lock(&ld->ops_lock); if (ld->ops && ld->ops->set_contrast) { - pr_debug("lcd: set contrast to %d\n", contrast); + pr_debug("lcd: set contrast to %lu\n", contrast); ld->ops->set_contrast(ld, contrast); rc = count; } From 48e78e8cc87ab80617ef0c5a146701ca96a4a51d Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:21 -0800 Subject: [PATCH 067/124] backlight: convert platform_lcd to devm_kzalloc() Saves some error handling code and eliminates a class of leaks. Signed-off-by: Mark Brown Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/platform_lcd.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/video/backlight/platform_lcd.c b/drivers/video/backlight/platform_lcd.c index 187da59e3a13f6..f0bf491ed087a5 100644 --- a/drivers/video/backlight/platform_lcd.c +++ b/drivers/video/backlight/platform_lcd.c @@ -85,7 +85,8 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) return -EINVAL; } - plcd = kzalloc(sizeof(struct platform_lcd), GFP_KERNEL); + plcd = devm_kzalloc(&pdev->dev, sizeof(struct platform_lcd), + GFP_KERNEL); if (!plcd) { dev_err(dev, "no memory for state\n"); return -ENOMEM; @@ -98,7 +99,7 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) if (IS_ERR(plcd->lcd)) { dev_err(dev, "cannot register lcd device\n"); err = PTR_ERR(plcd->lcd); - goto err_mem; + goto err; } platform_set_drvdata(pdev, plcd); @@ -106,8 +107,7 @@ static int __devinit platform_lcd_probe(struct platform_device *pdev) return 0; - err_mem: - kfree(plcd); + err: return err; } @@ -116,7 +116,6 @@ static int __devexit platform_lcd_remove(struct platform_device *pdev) struct platform_lcd *plcd = platform_get_drvdata(pdev); lcd_device_unregister(plcd->lcd); - kfree(plcd); return 0; } From e2c17bc6f717a8847df2a867caec6ba4fe85f3fc Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:22 -0800 Subject: [PATCH 068/124] backlight: convert pwm_bl to dev_pm_ops Should be no functional changes, mainly a reorganisation to support future work. [akpm@linux-foundation.org: fix CONFIG_PM=n build] Signed-off-by: Mark Brown Cc: Richard Purdie Cc: Florian Tobias Schandinat Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/pwm_bl.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index b811e8fb406273..7496d04e1d3c1f 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -169,10 +169,9 @@ static int pwm_backlight_remove(struct platform_device *pdev) } #ifdef CONFIG_PM -static int pwm_backlight_suspend(struct platform_device *pdev, - pm_message_t state) +static int pwm_backlight_suspend(struct device *dev) { - struct backlight_device *bl = platform_get_drvdata(pdev); + struct backlight_device *bl = dev_get_drvdata(dev); struct pwm_bl_data *pb = dev_get_drvdata(&bl->dev); if (pb->notify) @@ -184,27 +183,29 @@ static int pwm_backlight_suspend(struct platform_device *pdev, return 0; } -static int pwm_backlight_resume(struct platform_device *pdev) +static int pwm_backlight_resume(struct device *dev) { - struct backlight_device *bl = platform_get_drvdata(pdev); + struct backlight_device *bl = dev_get_drvdata(dev); backlight_update_status(bl); return 0; } -#else -#define pwm_backlight_suspend NULL -#define pwm_backlight_resume NULL + +static SIMPLE_DEV_PM_OPS(pwm_backlight_pm_ops, pwm_backlight_suspend, + pwm_backlight_resume); + #endif static struct platform_driver pwm_backlight_driver = { .driver = { .name = "pwm-backlight", .owner = THIS_MODULE, +#ifdef CONFIG_PM + .pm = &pwm_backlight_pm_ops, +#endif }, .probe = pwm_backlight_probe, .remove = pwm_backlight_remove, - .suspend = pwm_backlight_suspend, - .resume = pwm_backlight_resume, }; module_platform_driver(pwm_backlight_driver); From 892a8843fbef07a7f2ab62d5f7ff5c16ea0903b0 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:24 -0800 Subject: [PATCH 069/124] leds: convert led platform drivers to module_platform_driver Factor out some boilerplate code for platform driver registration into module_platform_driver. Signed-off-by: Axel Lin Acked-by: Haojian Zhuang [led-88pm860x.c] Acked-by: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Cc: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-88pm860x.c | 12 +----------- drivers/leds/leds-adp5520.c | 12 +----------- drivers/leds/leds-ams-delta.c | 13 +------------ drivers/leds/leds-asic3.c | 16 ++-------------- drivers/leds/leds-atmel-pwm.c | 17 +++-------------- drivers/leds/leds-cobalt-qube.c | 17 ++--------------- drivers/leds/leds-da903x.c | 12 +----------- drivers/leds/leds-fsg.c | 15 +-------------- drivers/leds/leds-gpio.c | 16 ++-------------- drivers/leds/leds-hp6xx.c | 17 ++--------------- drivers/leds/leds-lt3593.c | 16 ++-------------- drivers/leds/leds-mc13783.c | 12 +----------- drivers/leds/leds-netxbig.c | 15 ++------------- drivers/leds/leds-ns2.c | 15 ++------------- drivers/leds/leds-pwm.c | 13 +------------ drivers/leds/leds-rb532.c | 16 ++-------------- drivers/leds/leds-regulator.c | 12 +----------- drivers/leds/leds-renesas-tpu.c | 13 +------------ drivers/leds/leds-s3c24xx.c | 13 +------------ drivers/leds/leds-wm831x-status.c | 12 +----------- drivers/leds/leds-wm8350.c | 12 +----------- 21 files changed, 31 insertions(+), 265 deletions(-) diff --git a/drivers/leds/leds-88pm860x.c b/drivers/leds/leds-88pm860x.c index 0810604dc70130..4ca00624bd1860 100644 --- a/drivers/leds/leds-88pm860x.c +++ b/drivers/leds/leds-88pm860x.c @@ -238,17 +238,7 @@ static struct platform_driver pm860x_led_driver = { .remove = pm860x_led_remove, }; -static int __devinit pm860x_led_init(void) -{ - return platform_driver_register(&pm860x_led_driver); -} -module_init(pm860x_led_init); - -static void __devexit pm860x_led_exit(void) -{ - platform_driver_unregister(&pm860x_led_driver); -} -module_exit(pm860x_led_exit); +module_platform_driver(pm860x_led_driver); MODULE_DESCRIPTION("LED driver for Marvell PM860x"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/leds/leds-adp5520.c b/drivers/leds/leds-adp5520.c index 7ba4c7b5b97e07..b1400db3f839a2 100644 --- a/drivers/leds/leds-adp5520.c +++ b/drivers/leds/leds-adp5520.c @@ -213,17 +213,7 @@ static struct platform_driver adp5520_led_driver = { .remove = __devexit_p(adp5520_led_remove), }; -static int __init adp5520_led_init(void) -{ - return platform_driver_register(&adp5520_led_driver); -} -module_init(adp5520_led_init); - -static void __exit adp5520_led_exit(void) -{ - platform_driver_unregister(&adp5520_led_driver); -} -module_exit(adp5520_led_exit); +module_platform_driver(adp5520_led_driver); MODULE_AUTHOR("Michael Hennerich "); MODULE_DESCRIPTION("LEDS ADP5520(01) Driver"); diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 8c00937bf7e74d..07428357c83fd4 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -118,18 +118,7 @@ static struct platform_driver ams_delta_led_driver = { }, }; -static int __init ams_delta_led_init(void) -{ - return platform_driver_register(&ams_delta_led_driver); -} - -static void __exit ams_delta_led_exit(void) -{ - platform_driver_unregister(&ams_delta_led_driver); -} - -module_init(ams_delta_led_init); -module_exit(ams_delta_led_exit); +module_platform_driver(ams_delta_led_driver); MODULE_AUTHOR("Jonathan McDowell "); MODULE_DESCRIPTION("Amstrad Delta LED driver"); diff --git a/drivers/leds/leds-asic3.c b/drivers/leds/leds-asic3.c index 48d9fe61bdfcdc..525a92492837bb 100644 --- a/drivers/leds/leds-asic3.c +++ b/drivers/leds/leds-asic3.c @@ -179,21 +179,9 @@ static struct platform_driver asic3_led_driver = { }, }; -MODULE_ALIAS("platform:leds-asic3"); - -static int __init asic3_led_init(void) -{ - return platform_driver_register(&asic3_led_driver); -} - -static void __exit asic3_led_exit(void) -{ - platform_driver_unregister(&asic3_led_driver); -} - -module_init(asic3_led_init); -module_exit(asic3_led_exit); +module_platform_driver(asic3_led_driver); MODULE_AUTHOR("Paul Parsons "); MODULE_DESCRIPTION("HTC ASIC3 LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-asic3"); diff --git a/drivers/leds/leds-atmel-pwm.c b/drivers/leds/leds-atmel-pwm.c index 109c875ea23348..800243b6037ed9 100644 --- a/drivers/leds/leds-atmel-pwm.c +++ b/drivers/leds/leds-atmel-pwm.c @@ -134,29 +134,18 @@ static int __exit pwmled_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:leds-atmel-pwm"); - static struct platform_driver pwmled_driver = { .driver = { .name = "leds-atmel-pwm", .owner = THIS_MODULE, }, /* REVISIT add suspend() and resume() methods */ + .probe = pwmled_probe, .remove = __exit_p(pwmled_remove), }; -static int __init modinit(void) -{ - return platform_driver_probe(&pwmled_driver, pwmled_probe); -} -module_init(modinit); - -static void __exit modexit(void) -{ - platform_driver_unregister(&pwmled_driver); -} -module_exit(modexit); +module_platform_driver(pwmled_driver); MODULE_DESCRIPTION("Driver for LEDs with PWM-controlled brightness"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-atmel-pwm"); diff --git a/drivers/leds/leds-cobalt-qube.c b/drivers/leds/leds-cobalt-qube.c index da5fb016b1a550..6a8725cc7b4dfe 100644 --- a/drivers/leds/leds-cobalt-qube.c +++ b/drivers/leds/leds-cobalt-qube.c @@ -75,9 +75,6 @@ static int __devexit cobalt_qube_led_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:cobalt-qube-leds"); - static struct platform_driver cobalt_qube_led_driver = { .probe = cobalt_qube_led_probe, .remove = __devexit_p(cobalt_qube_led_remove), @@ -87,19 +84,9 @@ static struct platform_driver cobalt_qube_led_driver = { }, }; -static int __init cobalt_qube_led_init(void) -{ - return platform_driver_register(&cobalt_qube_led_driver); -} - -static void __exit cobalt_qube_led_exit(void) -{ - platform_driver_unregister(&cobalt_qube_led_driver); -} - -module_init(cobalt_qube_led_init); -module_exit(cobalt_qube_led_exit); +module_platform_driver(cobalt_qube_led_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Front LED support for Cobalt Server"); MODULE_AUTHOR("Florian Fainelli "); +MODULE_ALIAS("platform:cobalt-qube-leds"); diff --git a/drivers/leds/leds-da903x.c b/drivers/leds/leds-da903x.c index f28931cf678104..d9cd73ebd6c44c 100644 --- a/drivers/leds/leds-da903x.c +++ b/drivers/leds/leds-da903x.c @@ -158,17 +158,7 @@ static struct platform_driver da903x_led_driver = { .remove = __devexit_p(da903x_led_remove), }; -static int __init da903x_led_init(void) -{ - return platform_driver_register(&da903x_led_driver); -} -module_init(da903x_led_init); - -static void __exit da903x_led_exit(void) -{ - platform_driver_unregister(&da903x_led_driver); -} -module_exit(da903x_led_exit); +module_platform_driver(da903x_led_driver); MODULE_DESCRIPTION("LEDs driver for Dialog Semiconductor DA9030/DA9034"); MODULE_AUTHOR("Eric Miao " diff --git a/drivers/leds/leds-fsg.c b/drivers/leds/leds-fsg.c index 49aceffaa5b6b7..b9053fa6e25343 100644 --- a/drivers/leds/leds-fsg.c +++ b/drivers/leds/leds-fsg.c @@ -224,20 +224,7 @@ static struct platform_driver fsg_led_driver = { }, }; - -static int __init fsg_led_init(void) -{ - return platform_driver_register(&fsg_led_driver); -} - -static void __exit fsg_led_exit(void) -{ - platform_driver_unregister(&fsg_led_driver); -} - - -module_init(fsg_led_init); -module_exit(fsg_led_exit); +module_platform_driver(fsg_led_driver); MODULE_AUTHOR("Rod Whitby "); MODULE_DESCRIPTION("Freecom FSG-3 LED driver"); diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index 399a86f2013a14..7df74cb97e702e 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -293,21 +293,9 @@ static struct platform_driver gpio_led_driver = { }, }; -MODULE_ALIAS("platform:leds-gpio"); - -static int __init gpio_led_init(void) -{ - return platform_driver_register(&gpio_led_driver); -} - -static void __exit gpio_led_exit(void) -{ - platform_driver_unregister(&gpio_led_driver); -} - -module_init(gpio_led_init); -module_exit(gpio_led_exit); +module_platform_driver(gpio_led_driver); MODULE_AUTHOR("Raphael Assenat , Trent Piepho "); MODULE_DESCRIPTION("GPIO LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-gpio"); diff --git a/drivers/leds/leds-hp6xx.c b/drivers/leds/leds-hp6xx.c index bcfbd3a60eab6b..366b6055e33063 100644 --- a/drivers/leds/leds-hp6xx.c +++ b/drivers/leds/leds-hp6xx.c @@ -79,9 +79,6 @@ static int hp6xxled_remove(struct platform_device *pdev) return 0; } -/* work with hotplug and coldplug */ -MODULE_ALIAS("platform:hp6xx-led"); - static struct platform_driver hp6xxled_driver = { .probe = hp6xxled_probe, .remove = hp6xxled_remove, @@ -91,19 +88,9 @@ static struct platform_driver hp6xxled_driver = { }, }; -static int __init hp6xxled_init(void) -{ - return platform_driver_register(&hp6xxled_driver); -} - -static void __exit hp6xxled_exit(void) -{ - platform_driver_unregister(&hp6xxled_driver); -} - -module_init(hp6xxled_init); -module_exit(hp6xxled_exit); +module_platform_driver(hp6xxled_driver); MODULE_AUTHOR("Kristoffer Ericson "); MODULE_DESCRIPTION("HP Jornada 6xx LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:hp6xx-led"); diff --git a/drivers/leds/leds-lt3593.c b/drivers/leds/leds-lt3593.c index 53f67b8ce55db5..e311a96c446975 100644 --- a/drivers/leds/leds-lt3593.c +++ b/drivers/leds/leds-lt3593.c @@ -199,21 +199,9 @@ static struct platform_driver lt3593_led_driver = { }, }; -MODULE_ALIAS("platform:leds-lt3593"); - -static int __init lt3593_led_init(void) -{ - return platform_driver_register(<3593_led_driver); -} - -static void __exit lt3593_led_exit(void) -{ - platform_driver_unregister(<3593_led_driver); -} - -module_init(lt3593_led_init); -module_exit(lt3593_led_exit); +module_platform_driver(lt3593_led_driver); MODULE_AUTHOR("Daniel Mack "); MODULE_DESCRIPTION("LED driver for LT3593 controllers"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-lt3593"); diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c index b3393a9f21398c..c61e8c4f54694d 100644 --- a/drivers/leds/leds-mc13783.c +++ b/drivers/leds/leds-mc13783.c @@ -385,17 +385,7 @@ static struct platform_driver mc13783_led_driver = { .remove = __devexit_p(mc13783_led_remove), }; -static int __init mc13783_led_init(void) -{ - return platform_driver_register(&mc13783_led_driver); -} -module_init(mc13783_led_init); - -static void __exit mc13783_led_exit(void) -{ - platform_driver_unregister(&mc13783_led_driver); -} -module_exit(mc13783_led_exit); +module_platform_driver(mc13783_led_driver); MODULE_DESCRIPTION("LEDs driver for Freescale MC13783 PMIC"); MODULE_AUTHOR("Philippe Retornaz "); diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index f2e51c13439962..8c7a4ea10dc30c 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -429,21 +429,10 @@ static struct platform_driver netxbig_led_driver = { .owner = THIS_MODULE, }, }; -MODULE_ALIAS("platform:leds-netxbig"); - -static int __init netxbig_led_init(void) -{ - return platform_driver_register(&netxbig_led_driver); -} -static void __exit netxbig_led_exit(void) -{ - platform_driver_unregister(&netxbig_led_driver); -} - -module_init(netxbig_led_init); -module_exit(netxbig_led_exit); +module_platform_driver(netxbig_led_driver); MODULE_AUTHOR("Simon Guinot "); MODULE_DESCRIPTION("LED driver for LaCie xBig Network boards"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-netxbig"); diff --git a/drivers/leds/leds-ns2.c b/drivers/leds/leds-ns2.c index 37b7d0cfe5867e..2f0a14421a7344 100644 --- a/drivers/leds/leds-ns2.c +++ b/drivers/leds/leds-ns2.c @@ -323,21 +323,10 @@ static struct platform_driver ns2_led_driver = { .owner = THIS_MODULE, }, }; -MODULE_ALIAS("platform:leds-ns2"); - -static int __init ns2_led_init(void) -{ - return platform_driver_register(&ns2_led_driver); -} -static void __exit ns2_led_exit(void) -{ - platform_driver_unregister(&ns2_led_driver); -} - -module_init(ns2_led_init); -module_exit(ns2_led_exit); +module_platform_driver(ns2_led_driver); MODULE_AUTHOR("Simon Guinot "); MODULE_DESCRIPTION("Network Space v2 LED driver"); MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:leds-ns2"); diff --git a/drivers/leds/leds-pwm.c b/drivers/leds/leds-pwm.c index 666daf77872e56..3ed92f34bd4477 100644 --- a/drivers/leds/leds-pwm.c +++ b/drivers/leds/leds-pwm.c @@ -135,18 +135,7 @@ static struct platform_driver led_pwm_driver = { }, }; -static int __init led_pwm_init(void) -{ - return platform_driver_register(&led_pwm_driver); -} - -static void __exit led_pwm_exit(void) -{ - platform_driver_unregister(&led_pwm_driver); -} - -module_init(led_pwm_init); -module_exit(led_pwm_exit); +module_platform_driver(led_pwm_driver); MODULE_AUTHOR("Luotao Fu "); MODULE_DESCRIPTION("PWM LED driver for PXA"); diff --git a/drivers/leds/leds-rb532.c b/drivers/leds/leds-rb532.c index c3525f37f73d68..a7815b6cd8567d 100644 --- a/drivers/leds/leds-rb532.c +++ b/drivers/leds/leds-rb532.c @@ -57,21 +57,9 @@ static struct platform_driver rb532_led_driver = { }, }; -static int __init rb532_led_init(void) -{ - return platform_driver_register(&rb532_led_driver); -} - -static void __exit rb532_led_exit(void) -{ - platform_driver_unregister(&rb532_led_driver); -} - -module_init(rb532_led_init); -module_exit(rb532_led_exit); - -MODULE_ALIAS("platform:rb532-led"); +module_platform_driver(rb532_led_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("User LED support for Routerboard532"); MODULE_AUTHOR("Phil Sutter "); +MODULE_ALIAS("platform:rb532-led"); diff --git a/drivers/leds/leds-regulator.c b/drivers/leds/leds-regulator.c index 8497f56f8e461c..df7e963bddd304 100644 --- a/drivers/leds/leds-regulator.c +++ b/drivers/leds/leds-regulator.c @@ -229,17 +229,7 @@ static struct platform_driver regulator_led_driver = { .remove = __devexit_p(regulator_led_remove), }; -static int __init regulator_led_init(void) -{ - return platform_driver_register(®ulator_led_driver); -} -module_init(regulator_led_init); - -static void __exit regulator_led_exit(void) -{ - platform_driver_unregister(®ulator_led_driver); -} -module_exit(regulator_led_exit); +module_platform_driver(regulator_led_driver); MODULE_AUTHOR("Antonio Ospite "); MODULE_DESCRIPTION("Regulator driven LED driver"); diff --git a/drivers/leds/leds-renesas-tpu.c b/drivers/leds/leds-renesas-tpu.c index 3ee540eb127e7f..32fe337d5c687f 100644 --- a/drivers/leds/leds-renesas-tpu.c +++ b/drivers/leds/leds-renesas-tpu.c @@ -339,18 +339,7 @@ static struct platform_driver r_tpu_device_driver = { } }; -static int __init r_tpu_init(void) -{ - return platform_driver_register(&r_tpu_device_driver); -} - -static void __exit r_tpu_exit(void) -{ - platform_driver_unregister(&r_tpu_device_driver); -} - -module_init(r_tpu_init); -module_exit(r_tpu_exit); +module_platform_driver(r_tpu_device_driver); MODULE_AUTHOR("Magnus Damm"); MODULE_DESCRIPTION("Renesas TPU LED Driver"); diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index 29f8b0f0e2c6f8..bd0a5ed49c42d9 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -121,18 +121,7 @@ static struct platform_driver s3c24xx_led_driver = { }, }; -static int __init s3c24xx_led_init(void) -{ - return platform_driver_register(&s3c24xx_led_driver); -} - -static void __exit s3c24xx_led_exit(void) -{ - platform_driver_unregister(&s3c24xx_led_driver); -} - -module_init(s3c24xx_led_init); -module_exit(s3c24xx_led_exit); +module_platform_driver(s3c24xx_led_driver); MODULE_AUTHOR("Ben Dooks "); MODULE_DESCRIPTION("S3C24XX LED driver"); diff --git a/drivers/leds/leds-wm831x-status.c b/drivers/leds/leds-wm831x-status.c index b1eb34c3e81f41..444a68d8e17e1f 100644 --- a/drivers/leds/leds-wm831x-status.c +++ b/drivers/leds/leds-wm831x-status.c @@ -325,17 +325,7 @@ static struct platform_driver wm831x_status_driver = { .remove = wm831x_status_remove, }; -static int __devinit wm831x_status_init(void) -{ - return platform_driver_register(&wm831x_status_driver); -} -module_init(wm831x_status_init); - -static void wm831x_status_exit(void) -{ - platform_driver_unregister(&wm831x_status_driver); -} -module_exit(wm831x_status_exit); +module_platform_driver(wm831x_status_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("WM831x status LED driver"); diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c index 4a127657835228..390c0f679628f7 100644 --- a/drivers/leds/leds-wm8350.c +++ b/drivers/leds/leds-wm8350.c @@ -295,17 +295,7 @@ static struct platform_driver wm8350_led_driver = { .shutdown = wm8350_led_shutdown, }; -static int __devinit wm8350_led_init(void) -{ - return platform_driver_register(&wm8350_led_driver); -} -module_init(wm8350_led_init); - -static void wm8350_led_exit(void) -{ - platform_driver_unregister(&wm8350_led_driver); -} -module_exit(wm8350_led_exit); +module_platform_driver(wm8350_led_driver); MODULE_AUTHOR("Mark Brown"); MODULE_DESCRIPTION("WM8350 LED driver"); From 09a0d183ef3d310ee9d0b835d9db741fda9d6d46 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:27 -0800 Subject: [PATCH 070/124] leds: convert led i2c drivers to module_i2c_driver Factor out some boilerplate code for i2c driver registration into module_i2c_driver. Signed-off-by: Axel Lin Cc: Haojian Zhuang Cc: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Cc: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-bd2802.c | 12 +----------- drivers/leds/leds-lm3530.c | 13 +------------ drivers/leds/leds-lp3944.c | 13 +------------ drivers/leds/leds-lp5521.c | 20 +------------------- drivers/leds/leds-lp5523.c | 20 +------------------- drivers/leds/leds-pca9532.c | 14 +------------- drivers/leds/leds-pca955x.c | 13 +------------ 7 files changed, 7 insertions(+), 98 deletions(-) diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c index ea2185531f826e..10e40971ffbb56 100644 --- a/drivers/leds/leds-bd2802.c +++ b/drivers/leds/leds-bd2802.c @@ -813,17 +813,7 @@ static struct i2c_driver bd2802_i2c_driver = { .id_table = bd2802_id, }; -static int __init bd2802_init(void) -{ - return i2c_add_driver(&bd2802_i2c_driver); -} -module_init(bd2802_init); - -static void __exit bd2802_exit(void) -{ - i2c_del_driver(&bd2802_i2c_driver); -} -module_exit(bd2802_exit); +module_i2c_driver(bd2802_i2c_driver); MODULE_AUTHOR("Kim Kyuwon "); MODULE_DESCRIPTION("BD2802 LED driver"); diff --git a/drivers/leds/leds-lm3530.c b/drivers/leds/leds-lm3530.c index 0630e4f4b2866a..45e6878d73741d 100644 --- a/drivers/leds/leds-lm3530.c +++ b/drivers/leds/leds-lm3530.c @@ -457,18 +457,7 @@ static struct i2c_driver lm3530_i2c_driver = { }, }; -static int __init lm3530_init(void) -{ - return i2c_add_driver(&lm3530_i2c_driver); -} - -static void __exit lm3530_exit(void) -{ - i2c_del_driver(&lm3530_i2c_driver); -} - -module_init(lm3530_init); -module_exit(lm3530_exit); +module_i2c_driver(lm3530_i2c_driver); MODULE_DESCRIPTION("Back Light driver for LM3530"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/leds/leds-lp3944.c b/drivers/leds/leds-lp3944.c index 9010c054615e41..b8f9f0a5d4318d 100644 --- a/drivers/leds/leds-lp3944.c +++ b/drivers/leds/leds-lp3944.c @@ -453,18 +453,7 @@ static struct i2c_driver lp3944_driver = { .id_table = lp3944_id, }; -static int __init lp3944_module_init(void) -{ - return i2c_add_driver(&lp3944_driver); -} - -static void __exit lp3944_module_exit(void) -{ - i2c_del_driver(&lp3944_driver); -} - -module_init(lp3944_module_init); -module_exit(lp3944_module_exit); +module_i2c_driver(lp3944_driver); MODULE_AUTHOR("Antonio Ospite "); MODULE_DESCRIPTION("LP3944 Fun Light Chip"); diff --git a/drivers/leds/leds-lp5521.c b/drivers/leds/leds-lp5521.c index cb641f1b33429a..d62a7982a5e66a 100644 --- a/drivers/leds/leds-lp5521.c +++ b/drivers/leds/leds-lp5521.c @@ -797,25 +797,7 @@ static struct i2c_driver lp5521_driver = { .id_table = lp5521_id, }; -static int __init lp5521_init(void) -{ - int ret; - - ret = i2c_add_driver(&lp5521_driver); - - if (ret < 0) - printk(KERN_ALERT "Adding lp5521 driver failed\n"); - - return ret; -} - -static void __exit lp5521_exit(void) -{ - i2c_del_driver(&lp5521_driver); -} - -module_init(lp5521_init); -module_exit(lp5521_exit); +module_i2c_driver(lp5521_driver); MODULE_AUTHOR("Mathias Nyman, Yuri Zaporozhets, Samu Onkalo"); MODULE_DESCRIPTION("LP5521 LED engine"); diff --git a/drivers/leds/leds-lp5523.c b/drivers/leds/leds-lp5523.c index 5971e309b23423..0170760badbb91 100644 --- a/drivers/leds/leds-lp5523.c +++ b/drivers/leds/leds-lp5523.c @@ -1021,25 +1021,7 @@ static struct i2c_driver lp5523_driver = { .id_table = lp5523_id, }; -static int __init lp5523_init(void) -{ - int ret; - - ret = i2c_add_driver(&lp5523_driver); - - if (ret < 0) - printk(KERN_ALERT "Adding lp5523 driver failed\n"); - - return ret; -} - -static void __exit lp5523_exit(void) -{ - i2c_del_driver(&lp5523_driver); -} - -module_init(lp5523_init); -module_exit(lp5523_exit); +module_i2c_driver(lp5523_driver); MODULE_AUTHOR("Mathias Nyman "); MODULE_DESCRIPTION("LP5523 LED engine"); diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index a2c874623e3521..ceccab44b5b818 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -489,20 +489,8 @@ static int pca9532_remove(struct i2c_client *client) return 0; } -static int __init pca9532_init(void) -{ - return i2c_add_driver(&pca9532_driver); -} - -static void __exit pca9532_exit(void) -{ - i2c_del_driver(&pca9532_driver); -} +module_i2c_driver(pca9532_driver); MODULE_AUTHOR("Riku Voipio"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("PCA 9532 LED dimmer"); - -module_init(pca9532_init); -module_exit(pca9532_exit); - diff --git a/drivers/leds/leds-pca955x.c b/drivers/leds/leds-pca955x.c index 66aa3e8e786f54..dcc3bc3d38db8c 100644 --- a/drivers/leds/leds-pca955x.c +++ b/drivers/leds/leds-pca955x.c @@ -371,18 +371,7 @@ static struct i2c_driver pca955x_driver = { .id_table = pca955x_id, }; -static int __init pca955x_leds_init(void) -{ - return i2c_add_driver(&pca955x_driver); -} - -static void __exit pca955x_leds_exit(void) -{ - i2c_del_driver(&pca955x_driver); -} - -module_init(pca955x_leds_init); -module_exit(pca955x_leds_exit); +module_i2c_driver(pca955x_driver); MODULE_AUTHOR("Nate Case "); MODULE_DESCRIPTION("PCA955x LED driver"); From 19b955768b4ede7c9ad0efe4def70852c530d4f9 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:30 -0800 Subject: [PATCH 071/124] leds: convert leds-dac124s085 to module_spi_driver Factor out some boilerplate code for spi driver registration into module_spi_driver. Signed-off-by: Axel Lin Cc: Haojian Zhuang Cc: Mark Brown Cc: Richard Purdie Cc: Michael Hennerich Cc: Mike Rapoport Acked-by: Guennadi Liakhovetski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-dac124s085.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/drivers/leds/leds-dac124s085.c b/drivers/leds/leds-dac124s085.c index 31cf0d60a9a546..d56c14269ff0c7 100644 --- a/drivers/leds/leds-dac124s085.c +++ b/drivers/leds/leds-dac124s085.c @@ -131,18 +131,7 @@ static struct spi_driver dac124s085_driver = { }, }; -static int __init dac124s085_leds_init(void) -{ - return spi_register_driver(&dac124s085_driver); -} - -static void __exit dac124s085_leds_exit(void) -{ - spi_unregister_driver(&dac124s085_driver); -} - -module_init(dac124s085_leds_init); -module_exit(dac124s085_leds_exit); +module_spi_driver(dac124s085_driver); MODULE_AUTHOR("Guennadi Liakhovetski "); MODULE_DESCRIPTION("DAC124S085 LED driver"); From 1980bcfa6bc5a77491176ba695422e205dcfd2da Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:34 -0800 Subject: [PATCH 072/124] drivers/leds/leds-lp5523.c: remove unneeded forward declaration Signed-off-by: Axel Lin Cc: Samu Onkalo Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-lp5523.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/leds/leds-lp5523.c b/drivers/leds/leds-lp5523.c index 0170760badbb91..73e791ae725993 100644 --- a/drivers/leds/leds-lp5523.c +++ b/drivers/leds/leds-lp5523.c @@ -870,8 +870,6 @@ static int __devinit lp5523_init_led(struct lp5523_led *led, struct device *dev, return 0; } -static struct i2c_driver lp5523_driver; - static int __devinit lp5523_probe(struct i2c_client *client, const struct i2c_device_id *id) { From 95dafd475382740a841697a2ead6566175d26390 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:35 -0800 Subject: [PATCH 073/124] drivers/leds/leds-bd2802.c: use gpio_request_one() Use gpio_request_one() instead of multiple gpiolib calls. Signed-off-by: Axel Lin Cc: Kim Kyuwon Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-bd2802.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/leds/leds-bd2802.c b/drivers/leds/leds-bd2802.c index 10e40971ffbb56..591cbdf5a0463e 100644 --- a/drivers/leds/leds-bd2802.c +++ b/drivers/leds/leds-bd2802.c @@ -688,8 +688,7 @@ static int __devinit bd2802_probe(struct i2c_client *client, i2c_set_clientdata(client, led); /* Configure RESET GPIO (L: RESET, H: RESET cancel) */ - gpio_request(pdata->reset_gpio, "RGB_RESETB"); - gpio_direction_output(pdata->reset_gpio, 1); + gpio_request_one(pdata->reset_gpio, GPIOF_OUT_INIT_HIGH, "RGB_RESETB"); /* Tacss = min 0.1ms */ udelay(100); From b96a573f4c27529d379922670e8cf5530120d5ca Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:37 -0800 Subject: [PATCH 074/124] drivers/leds/leds-netxbig.c: use gpio_request_one() Use gpio_request_one() instead of multiple gpiolib calls. This also simplifies error handling a bit. Signed-off-by: Axel Lin Cc: Simon Guinot Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-netxbig.c | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/drivers/leds/leds-netxbig.c b/drivers/leds/leds-netxbig.c index 8c7a4ea10dc30c..d8433f2d53bc71 100644 --- a/drivers/leds/leds-netxbig.c +++ b/drivers/leds/leds-netxbig.c @@ -81,35 +81,23 @@ static int __devinit gpio_ext_init(struct netxbig_gpio_ext *gpio_ext) /* Configure address GPIOs. */ for (i = 0; i < gpio_ext->num_addr; i++) { - err = gpio_request(gpio_ext->addr[i], "GPIO extension addr"); + err = gpio_request_one(gpio_ext->addr[i], GPIOF_OUT_INIT_LOW, + "GPIO extension addr"); if (err) goto err_free_addr; - err = gpio_direction_output(gpio_ext->addr[i], 0); - if (err) { - gpio_free(gpio_ext->addr[i]); - goto err_free_addr; - } } /* Configure data GPIOs. */ for (i = 0; i < gpio_ext->num_data; i++) { - err = gpio_request(gpio_ext->data[i], "GPIO extension data"); + err = gpio_request_one(gpio_ext->data[i], GPIOF_OUT_INIT_LOW, + "GPIO extension data"); if (err) goto err_free_data; - err = gpio_direction_output(gpio_ext->data[i], 0); - if (err) { - gpio_free(gpio_ext->data[i]); - goto err_free_data; - } } /* Configure "enable select" GPIO. */ - err = gpio_request(gpio_ext->enable, "GPIO extension enable"); + err = gpio_request_one(gpio_ext->enable, GPIOF_OUT_INIT_LOW, + "GPIO extension enable"); if (err) goto err_free_data; - err = gpio_direction_output(gpio_ext->enable, 0); - if (err) { - gpio_free(gpio_ext->enable); - goto err_free_data; - } return 0; From a6d511e5155406cd214d3af3ff9cffc69548b006 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 10 Jan 2012 15:09:40 -0800 Subject: [PATCH 075/124] leds: add driver for TCA6507 LED controller TI's TCA6507 is the LED driver in the GTA04 Openmoko motherboard. The driver provides full support for brightness levels and hardware blinking. This driver can drive each of 7 outputs as an LED or a GPIO output, and provides hardware-assist blinking. [akpm@linux-foundation.org: fix __mod_i2c_device_table alias] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: NeilBrown Cc: Richard Purdie Cc: Randy Dunlap Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/Kconfig | 8 + drivers/leds/Makefile | 1 + drivers/leds/leds-tca6507.c | 779 +++++++++++++++++++++++++++++++++++ include/linux/leds-tca6507.h | 34 ++ 4 files changed, 822 insertions(+) create mode 100644 drivers/leds/leds-tca6507.c create mode 100644 include/linux/leds-tca6507.h diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 1b75a56ebd0801..897a77dfa9d7dd 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -388,6 +388,14 @@ config LEDS_RENESAS_TPU pin function. The latter to support brightness control. Brightness control is supported but hardware blinking is not. +config LEDS_TCA6507 + tristate "LED Support for TCA6507 I2C chip" + depends on LEDS_CLASS && I2C + help + This option enables support for LEDs connected to TC6507 + LED driver chips accessed via the I2C bus. + Driver support brightness control and hardware-assisted blinking. + config LEDS_TRIGGERS bool "LED Trigger support" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index e4f6bf568880d2..5c9dc4b000d5d2 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LEDS_GPIO) += leds-gpio.o obj-$(CONFIG_LEDS_LP3944) += leds-lp3944.o obj-$(CONFIG_LEDS_LP5521) += leds-lp5521.o obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o +obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o obj-$(CONFIG_LEDS_FSG) += leds-fsg.o diff --git a/drivers/leds/leds-tca6507.c b/drivers/leds/leds-tca6507.c new file mode 100644 index 00000000000000..133f89fb7071c6 --- /dev/null +++ b/drivers/leds/leds-tca6507.c @@ -0,0 +1,779 @@ +/* + * leds-tca6507 + * + * The TCA6507 is a programmable LED controller that can drive 7 + * separate lines either by holding them low, or by pulsing them + * with modulated width. + * The modulation can be varied in a simple pattern to produce a blink or + * double-blink. + * + * This driver can configure each line either as a 'GPIO' which is out-only + * (no pull-up) or as an LED with variable brightness and hardware-assisted + * blinking. + * + * Apart from OFF and ON there are three programmable brightness levels which + * can be programmed from 0 to 15 and indicate how many 500usec intervals in + * each 8msec that the led is 'on'. The levels are named MASTER, BANK0 and + * BANK1. + * + * There are two different blink rates that can be programmed, each with + * separate time for rise, on, fall, off and second-off. Thus if 3 or more + * different non-trivial rates are required, software must be used for the extra + * rates. The two different blink rates must align with the two levels BANK0 and + * BANK1. + * This driver does not support double-blink so 'second-off' always matches + * 'off'. + * + * Only 16 different times can be programmed in a roughly logarithmic scale from + * 64ms to 16320ms. To be precise the possible times are: + * 0, 64, 128, 192, 256, 384, 512, 768, + * 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 + * + * Times that cannot be closely matched with these must be + * handled in software. This driver allows 12.5% error in matching. + * + * This driver does not allow rise/fall rates to be set explicitly. When trying + * to match a given 'on' or 'off' period, an appropriate pair of 'change' and + * 'hold' times are chosen to get a close match. If the target delay is even, + * the 'change' number will be the smaller; if odd, the 'hold' number will be + * the smaller. + + * Choosing pairs of delays with 12.5% errors allows us to match delays in the + * ranges: 56-72, 112-144, 168-216, 224-27504, 28560-36720. + * 26% of the achievable sums can be matched by multiple pairings. For example + * 1536 == 1536+0, 1024+512, or 768+768. This driver will always choose the + * pairing with the least maximum - 768+768 in this case. Other pairings are + * not available. + * + * Access to the 3 levels and 2 blinks are on a first-come, first-served basis. + * Access can be shared by multiple leds if they have the same level and + * either same blink rates, or some don't blink. + * When a led changes, it relinquishes access and tries again, so it might + * lose access to hardware blink. + * If a blink engine cannot be allocated, software blink is used. + * If the desired brightness cannot be allocated, the closest available non-zero + * brightness is used. As 'full' is always available, the worst case would be + * to have two different blink rates at '1', with Max at '2', then other leds + * will have to choose between '2' and '16'. Hopefully this is not likely. + * + * Each bank (BANK0 and BANK1) has two usage counts - LEDs using the brightness + * and LEDs using the blink. It can only be reprogrammed when the appropriate + * counter is zero. The MASTER level has a single usage count. + * + * Each Led has programmable 'on' and 'off' time as milliseconds. With each + * there is a flag saying if it was explicitly requested or defaulted. + * Similarly the banks know if each time was explicit or a default. Defaults + * are permitted to be changed freely - they are not recognised when matching. + * + * + * An led-tca6507 device must be provided with platform data. This data + * lists for each output: the name, default trigger, and whether the signal + * is being used as a GPiO rather than an led. 'struct led_plaform_data' + * is used for this. If 'name' is NULL, the output isn't used. If 'flags' + * is TCA6507_MAKE_CPIO, the output is a GPO. + * The "struct led_platform_data" can be embedded in a + * "struct tca6507_platform_data" which adds a 'gpio_base' for the GPiOs, + * and a 'setup' callback which is called once the GPiOs are available. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* LED select registers determine the source that drives LED outputs */ +#define TCA6507_LS_LED_OFF 0x0 /* Output HI-Z (off) */ +#define TCA6507_LS_LED_OFF1 0x1 /* Output HI-Z (off) - not used */ +#define TCA6507_LS_LED_PWM0 0x2 /* Output LOW with Bank0 rate */ +#define TCA6507_LS_LED_PWM1 0x3 /* Output LOW with Bank1 rate */ +#define TCA6507_LS_LED_ON 0x4 /* Output LOW (on) */ +#define TCA6507_LS_LED_MIR 0x5 /* Output LOW with Master Intensity */ +#define TCA6507_LS_BLINK0 0x6 /* Blink at Bank0 rate */ +#define TCA6507_LS_BLINK1 0x7 /* Blink at Bank1 rate */ + +enum { + BANK0, + BANK1, + MASTER, +}; +static int bank_source[3] = { + TCA6507_LS_LED_PWM0, + TCA6507_LS_LED_PWM1, + TCA6507_LS_LED_MIR, +}; +static int blink_source[2] = { + TCA6507_LS_BLINK0, + TCA6507_LS_BLINK1, +}; + +/* PWM registers */ +#define TCA6507_REG_CNT 11 + +/* + * 0x00, 0x01, 0x02 encode the TCA6507_LS_* values, each output + * owns one bit in each register + */ +#define TCA6507_FADE_ON 0x03 +#define TCA6507_FULL_ON 0x04 +#define TCA6507_FADE_OFF 0x05 +#define TCA6507_FIRST_OFF 0x06 +#define TCA6507_SECOND_OFF 0x07 +#define TCA6507_MAX_INTENSITY 0x08 +#define TCA6507_MASTER_INTENSITY 0x09 +#define TCA6507_INITIALIZE 0x0A + +#define INIT_CODE 0x8 + +#define TIMECODES 16 +static int time_codes[TIMECODES] = { + 0, 64, 128, 192, 256, 384, 512, 768, + 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 +}; + +/* Convert an led.brightness level (0..255) to a TCA6507 level (0..15) */ +static inline int TO_LEVEL(int brightness) +{ + return brightness >> 4; +} + +/* ...and convert back */ +static inline int TO_BRIGHT(int level) +{ + if (level) + return (level << 4) | 0xf; + return 0; +} + +#define NUM_LEDS 7 +struct tca6507_chip { + int reg_set; /* One bit per register where + * a '1' means the register + * should be written */ + u8 reg_file[TCA6507_REG_CNT]; + /* Bank 2 is Master Intensity and doesn't use times */ + struct bank { + int level; + int ontime, offtime; + int on_dflt, off_dflt; + int time_use, level_use; + } bank[3]; + struct i2c_client *client; + struct work_struct work; + spinlock_t lock; + + struct tca6507_led { + struct tca6507_chip *chip; + struct led_classdev led_cdev; + int num; + int ontime, offtime; + int on_dflt, off_dflt; + int bank; /* Bank used, or -1 */ + int blink; /* Set if hardware-blinking */ + } leds[NUM_LEDS]; +#ifdef CONFIG_GPIOLIB + struct gpio_chip gpio; + const char *gpio_name[NUM_LEDS]; + int gpio_map[NUM_LEDS]; +#endif +}; + +static const struct i2c_device_id tca6507_id[] = { + { "tca6507" }, + { } +}; +MODULE_DEVICE_TABLE(i2c, tca6507_id); + +static int choose_times(int msec, int *c1p, int *c2p) +{ + /* + * Choose two timecodes which add to 'msec' as near as possible. + * The first returned is the 'on' or 'off' time. The second is to be + * used as a 'fade-on' or 'fade-off' time. If 'msec' is even, + * the first will not be smaller than the second. If 'msec' is odd, + * the first will not be larger than the second. + * If we cannot get a sum within 1/8 of 'msec' fail with -EINVAL, + * otherwise return the sum that was achieved, plus 1 if the first is + * smaller. + * If two possibilities are equally good (e.g. 512+0, 256+256), choose + * the first pair so there is more change-time visible (i.e. it is + * softer). + */ + int c1, c2; + int tmax = msec * 9 / 8; + int tmin = msec * 7 / 8; + int diff = 65536; + + /* We start at '1' to ensure we never even think of choosing a + * total time of '0'. + */ + for (c1 = 1; c1 < TIMECODES; c1++) { + int t = time_codes[c1]; + if (t*2 < tmin) + continue; + if (t > tmax) + break; + for (c2 = 0; c2 <= c1; c2++) { + int tt = t + time_codes[c2]; + int d; + if (tt < tmin) + continue; + if (tt > tmax) + break; + /* This works! */ + d = abs(msec - tt); + if (d >= diff) + continue; + /* Best yet */ + *c1p = c1; + *c2p = c2; + diff = d; + if (d == 0) + return msec; + } + } + if (diff < 65536) { + int actual; + if (msec & 1) { + c1 = *c2p; + *c2p = *c1p; + *c1p = c1; + } + actual = time_codes[*c1p] + time_codes[*c2p]; + if (*c1p < *c2p) + return actual + 1; + else + return actual; + } + /* No close match */ + return -EINVAL; +} + +/* + * Update the register file with the appropriate 3-bit state for + * the given led. + */ +static void set_select(struct tca6507_chip *tca, int led, int val) +{ + int mask = (1 << led); + int bit; + + for (bit = 0; bit < 3; bit++) { + int n = tca->reg_file[bit] & ~mask; + if (val & (1 << bit)) + n |= mask; + if (tca->reg_file[bit] != n) { + tca->reg_file[bit] = n; + tca->reg_set |= (1 << bit); + } + } +} + +/* Update the register file with the appropriate 4-bit code for + * one bank or other. This can be used for timers, for levels, or + * for initialisation. + */ +static void set_code(struct tca6507_chip *tca, int reg, int bank, int new) +{ + int mask = 0xF; + int n; + if (bank) { + mask <<= 4; + new <<= 4; + } + n = tca->reg_file[reg] & ~mask; + n |= new; + if (tca->reg_file[reg] != n) { + tca->reg_file[reg] = n; + tca->reg_set |= 1 << reg; + } +} + +/* Update brightness level. */ +static void set_level(struct tca6507_chip *tca, int bank, int level) +{ + switch (bank) { + case BANK0: + case BANK1: + set_code(tca, TCA6507_MAX_INTENSITY, bank, level); + break; + case MASTER: + set_code(tca, TCA6507_MASTER_INTENSITY, 0, level); + break; + } + tca->bank[bank].level = level; +} + +/* Record all relevant time code for a given bank */ +static void set_times(struct tca6507_chip *tca, int bank) +{ + int c1, c2; + int result; + + result = choose_times(tca->bank[bank].ontime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose on times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].ontime); + set_code(tca, TCA6507_FADE_ON, bank, c2); + set_code(tca, TCA6507_FULL_ON, bank, c1); + tca->bank[bank].ontime = result; + + result = choose_times(tca->bank[bank].offtime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose off times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].offtime); + set_code(tca, TCA6507_FADE_OFF, bank, c2); + set_code(tca, TCA6507_FIRST_OFF, bank, c1); + set_code(tca, TCA6507_SECOND_OFF, bank, c1); + tca->bank[bank].offtime = result; + + set_code(tca, TCA6507_INITIALIZE, bank, INIT_CODE); +} + +/* Write all needed register of tca6507 */ + +static void tca6507_work(struct work_struct *work) +{ + struct tca6507_chip *tca = container_of(work, struct tca6507_chip, + work); + struct i2c_client *cl = tca->client; + int set; + u8 file[TCA6507_REG_CNT]; + int r; + + spin_lock_irq(&tca->lock); + set = tca->reg_set; + memcpy(file, tca->reg_file, TCA6507_REG_CNT); + tca->reg_set = 0; + spin_unlock_irq(&tca->lock); + + for (r = 0; r < TCA6507_REG_CNT; r++) + if (set & (1<chip; + if (led->bank >= 0) { + struct bank *b = tca->bank + led->bank; + if (led->blink) + b->time_use--; + b->level_use--; + } + led->blink = 0; + led->bank = -1; +} + +static int led_prepare(struct tca6507_led *led) +{ + /* Assign this led to a bank, configuring that bank if necessary. */ + int level = TO_LEVEL(led->led_cdev.brightness); + struct tca6507_chip *tca = led->chip; + int c1, c2; + int i; + struct bank *b; + int need_init = 0; + + led->led_cdev.brightness = TO_BRIGHT(level); + if (level == 0) { + set_select(tca, led->num, TCA6507_LS_LED_OFF); + return 0; + } + + if (led->ontime == 0 || led->offtime == 0) { + /* + * Just set the brightness, choosing first usable bank. + * If none perfect, choose best. + * Count backwards so we check MASTER bank first + * to avoid wasting a timer. + */ + int best = -1;/* full-on */ + int diff = 15-level; + + if (level == 15) { + set_select(tca, led->num, TCA6507_LS_LED_ON); + return 0; + } + + for (i = MASTER; i >= BANK0; i--) { + int d; + if (tca->bank[i].level == level || + tca->bank[i].level_use == 0) { + best = i; + break; + } + d = abs(level - tca->bank[i].level); + if (d < diff) { + diff = d; + best = i; + } + } + if (best == -1) { + /* Best brightness is full-on */ + set_select(tca, led->num, TCA6507_LS_LED_ON); + led->led_cdev.brightness = LED_FULL; + return 0; + } + + if (!tca->bank[best].level_use) + set_level(tca, best, level); + + tca->bank[best].level_use++; + led->bank = best; + set_select(tca, led->num, bank_source[best]); + led->led_cdev.brightness = TO_BRIGHT(tca->bank[best].level); + return 0; + } + + /* + * We have on/off time so we need to try to allocate a timing bank. + * First check if times are compatible with hardware and give up if + * not. + */ + if (choose_times(led->ontime, &c1, &c2) < 0) + return -EINVAL; + if (choose_times(led->offtime, &c1, &c2) < 0) + return -EINVAL; + + for (i = BANK0; i <= BANK1; i++) { + if (tca->bank[i].level_use == 0) + /* not in use - it is ours! */ + break; + if (tca->bank[i].level != level) + /* Incompatible level - skip */ + /* FIX: if timer matches we maybe should consider + * this anyway... + */ + continue; + + if (tca->bank[i].time_use == 0) + /* Timer not in use, and level matches - use it */ + break; + + if (!(tca->bank[i].on_dflt || + led->on_dflt || + tca->bank[i].ontime == led->ontime)) + /* on time is incompatible */ + continue; + + if (!(tca->bank[i].off_dflt || + led->off_dflt || + tca->bank[i].offtime == led->offtime)) + /* off time is incompatible */ + continue; + + /* looks like a suitable match */ + break; + } + + if (i > BANK1) + /* Nothing matches - how sad */ + return -EINVAL; + + b = &tca->bank[i]; + if (b->level_use == 0) + set_level(tca, i, level); + b->level_use++; + led->bank = i; + + if (b->on_dflt || + !led->on_dflt || + b->time_use == 0) { + b->ontime = led->ontime; + b->on_dflt = led->on_dflt; + need_init = 1; + } + + if (b->off_dflt || + !led->off_dflt || + b->time_use == 0) { + b->offtime = led->offtime; + b->off_dflt = led->off_dflt; + need_init = 1; + } + + if (need_init) + set_times(tca, i); + + led->ontime = b->ontime; + led->offtime = b->offtime; + + b->time_use++; + led->blink = 1; + led->led_cdev.brightness = TO_BRIGHT(b->level); + set_select(tca, led->num, blink_source[i]); + return 0; +} + +static int led_assign(struct tca6507_led *led) +{ + struct tca6507_chip *tca = led->chip; + int err; + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + led_release(led); + err = led_prepare(led); + if (err) { + /* + * Can only fail on timer setup. In that case we need to + * re-establish as steady level. + */ + led->ontime = 0; + led->offtime = 0; + led_prepare(led); + } + spin_unlock_irqrestore(&tca->lock, flags); + + if (tca->reg_set) + schedule_work(&tca->work); + return err; +} + +static void tca6507_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + led->led_cdev.brightness = brightness; + led->ontime = 0; + led->offtime = 0; + led_assign(led); +} + +static int tca6507_blink_set(struct led_classdev *led_cdev, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + + if (*delay_on == 0) + led->on_dflt = 1; + else if (delay_on != &led_cdev->blink_delay_on) + led->on_dflt = 0; + led->ontime = *delay_on; + + if (*delay_off == 0) + led->off_dflt = 1; + else if (delay_off != &led_cdev->blink_delay_off) + led->off_dflt = 0; + led->offtime = *delay_off; + + if (led->ontime == 0) + led->ontime = 512; + if (led->offtime == 0) + led->offtime = 512; + + if (led->led_cdev.brightness == LED_OFF) + led->led_cdev.brightness = LED_FULL; + if (led_assign(led) < 0) { + led->ontime = 0; + led->offtime = 0; + led->led_cdev.brightness = LED_OFF; + return -EINVAL; + } + *delay_on = led->ontime; + *delay_off = led->offtime; + return 0; +} + +#ifdef CONFIG_GPIOLIB +static void tca6507_gpio_set_value(struct gpio_chip *gc, + unsigned offset, int val) +{ + struct tca6507_chip *tca = container_of(gc, struct tca6507_chip, gpio); + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + /* + * 'OFF' is floating high, and 'ON' is pulled down, so it has the + * inverse sense of 'val'. + */ + set_select(tca, tca->gpio_map[offset], + val ? TCA6507_LS_LED_OFF : TCA6507_LS_LED_ON); + spin_unlock_irqrestore(&tca->lock, flags); + if (tca->reg_set) + schedule_work(&tca->work); +} + +static int tca6507_gpio_direction_output(struct gpio_chip *gc, + unsigned offset, int val) +{ + tca6507_gpio_set_value(gc, offset, val); + return 0; +} + +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + int err; + int i = 0; + int gpios = 0; + + for (i = 0; i < NUM_LEDS; i++) + if (pdata->leds.leds[i].name && pdata->leds.leds[i].flags) { + /* Configure as a gpio */ + tca->gpio_name[gpios] = pdata->leds.leds[i].name; + tca->gpio_map[gpios] = i; + gpios++; + } + + if (!gpios) + return 0; + + tca->gpio.label = "gpio-tca6507"; + tca->gpio.names = tca->gpio_name; + tca->gpio.ngpio = gpios; + tca->gpio.base = pdata->gpio_base; + tca->gpio.owner = THIS_MODULE; + tca->gpio.direction_output = tca6507_gpio_direction_output; + tca->gpio.set = tca6507_gpio_set_value; + tca->gpio.dev = &client->dev; + err = gpiochip_add(&tca->gpio); + if (err) { + tca->gpio.ngpio = 0; + return err; + } + if (pdata->setup) + pdata->setup(tca->gpio.base, tca->gpio.ngpio); + return 0; +} + +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ + if (tca->gpio.ngpio) { + int err = gpiochip_remove(&tca->gpio); + dev_err(&tca->client->dev, "%s failed, %d\n", + "gpiochip_remove()", err); + } +} +#else /* CONFIG_GPIOLIB */ +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + return 0; +} +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ +} +#endif /* CONFIG_GPIOLIB */ + +static int __devinit tca6507_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tca6507_chip *tca; + struct i2c_adapter *adapter; + struct tca6507_platform_data *pdata; + int err; + int i = 0; + + adapter = to_i2c_adapter(client->dev.parent); + pdata = client->dev.platform_data; + + if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) + return -EIO; + + if (!pdata || pdata->leds.num_leds != NUM_LEDS) { + dev_err(&client->dev, "Need %d entries in platform-data list\n", + NUM_LEDS); + return -ENODEV; + } + err = -ENOMEM; + tca = kzalloc(sizeof(*tca), GFP_KERNEL); + if (!tca) + goto exit; + + tca->client = client; + INIT_WORK(&tca->work, tca6507_work); + spin_lock_init(&tca->lock); + i2c_set_clientdata(client, tca); + + for (i = 0; i < NUM_LEDS; i++) { + struct tca6507_led *l = tca->leds + i; + + l->chip = tca; + l->num = i; + if (pdata->leds.leds[i].name && !pdata->leds.leds[i].flags) { + l->led_cdev.name = pdata->leds.leds[i].name; + l->led_cdev.default_trigger + = pdata->leds.leds[i].default_trigger; + l->led_cdev.brightness_set = tca6507_brightness_set; + l->led_cdev.blink_set = tca6507_blink_set; + l->bank = -1; + err = led_classdev_register(&client->dev, + &l->led_cdev); + if (err < 0) + goto exit; + } + } + err = tca6507_probe_gpios(client, tca, pdata); + if (err) + goto exit; + /* set all registers to known state - zero */ + tca->reg_set = 0x7f; + schedule_work(&tca->work); + + return 0; +exit: + while (i--) + if (tca->leds[i].led_cdev.name) + led_classdev_unregister(&tca->leds[i].led_cdev); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + return err; +} + +static int __devexit tca6507_remove(struct i2c_client *client) +{ + int i; + struct tca6507_chip *tca = i2c_get_clientdata(client); + struct tca6507_led *tca_leds = tca->leds; + + for (i = 0; i < NUM_LEDS; i++) { + if (tca_leds[i].led_cdev.name) + led_classdev_unregister(&tca_leds[i].led_cdev); + } + tca6507_remove_gpio(tca); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + + return 0; +} + +static struct i2c_driver tca6507_driver = { + .driver = { + .name = "leds-tca6507", + .owner = THIS_MODULE, + }, + .probe = tca6507_probe, + .remove = __devexit_p(tca6507_remove), + .id_table = tca6507_id, +}; + +static int __init tca6507_leds_init(void) +{ + return i2c_add_driver(&tca6507_driver); +} + +static void __exit tca6507_leds_exit(void) +{ + i2c_del_driver(&tca6507_driver); +} + +module_init(tca6507_leds_init); +module_exit(tca6507_leds_exit); + +MODULE_AUTHOR("NeilBrown "); +MODULE_DESCRIPTION("TCA6507 LED/GPO driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/leds-tca6507.h b/include/linux/leds-tca6507.h new file mode 100644 index 00000000000000..dcabf4fa2aef40 --- /dev/null +++ b/include/linux/leds-tca6507.h @@ -0,0 +1,34 @@ +/* + * TCA6507 LED chip driver. + * + * Copyright (C) 2011 Neil Brown + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#ifndef __LINUX_TCA6507_H +#define __LINUX_TCA6507_H +#include + +struct tca6507_platform_data { + struct led_platform_data leds; +#ifdef CONFIG_GPIOLIB + int gpio_base; + void (*setup)(unsigned gpio_base, unsigned ngpio); +#endif +}; + +#define TCA6507_MAKE_GPIO 1 +#endif /* __LINUX_TCA6507_H*/ From 3b080945aa7670354364c8f9e1a3a07cbb97beb3 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:09:43 -0800 Subject: [PATCH 076/124] drivers/leds/leds-mc13783.c: fix off-by-one for checking num_leds The LED id begins from 0. Thus the maximum number of leds should be MC13783_LED_MAX + 1. Signed-off-by: Axel Lin Acked-by: Philippe Retornaz Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-mc13783.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/leds/leds-mc13783.c b/drivers/leds/leds-mc13783.c index c61e8c4f54694d..8bc4915415509d 100644 --- a/drivers/leds/leds-mc13783.c +++ b/drivers/leds/leds-mc13783.c @@ -275,7 +275,7 @@ static int __devinit mc13783_led_probe(struct platform_device *pdev) return -ENODEV; } - if (pdata->num_leds < 1 || pdata->num_leds > MC13783_LED_MAX) { + if (pdata->num_leds < 1 || pdata->num_leds > (MC13783_LED_MAX + 1)) { dev_err(&pdev->dev, "Invalid led count %d\n", pdata->num_leds); return -EINVAL; } From 1713cb9d6069fac581fcea928f65ca6ca7c9facf Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:45 -0800 Subject: [PATCH 077/124] leds: convert wm831x status driver to devm_kzalloc() Saves a small amount of code and systematically eliminates leaks. Signed-off-by: Mark Brown Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-wm831x-status.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/leds/leds-wm831x-status.c b/drivers/leds/leds-wm831x-status.c index 444a68d8e17e1f..74a24cf897c386 100644 --- a/drivers/leds/leds-wm831x-status.c +++ b/drivers/leds/leds-wm831x-status.c @@ -237,7 +237,8 @@ static int wm831x_status_probe(struct platform_device *pdev) goto err; } - drvdata = kzalloc(sizeof(struct wm831x_status), GFP_KERNEL); + drvdata = devm_kzalloc(&pdev->dev, sizeof(struct wm831x_status), + GFP_KERNEL); if (!drvdata) return -ENOMEM; dev_set_drvdata(&pdev->dev, drvdata); @@ -300,7 +301,6 @@ static int wm831x_status_probe(struct platform_device *pdev) err_led: led_classdev_unregister(&drvdata->cdev); - kfree(drvdata); err: return ret; } @@ -311,7 +311,6 @@ static int wm831x_status_remove(struct platform_device *pdev) device_remove_file(drvdata->cdev.dev, &dev_attr_src); led_classdev_unregister(&drvdata->cdev); - kfree(drvdata); return 0; } From c957b614bde8539416dcde8d702370ff30b1c662 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:09:46 -0800 Subject: [PATCH 078/124] leds: convert wm8350 driver to devm_kzalloc() Saves a small amount of code and systematically eliminates leaks. Signed-off-by: Mark Brown Cc: Richard Purdie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/leds-wm8350.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c index 390c0f679628f7..918d4baff1c725 100644 --- a/drivers/leds/leds-wm8350.c +++ b/drivers/leds/leds-wm8350.c @@ -227,7 +227,7 @@ static int wm8350_led_probe(struct platform_device *pdev) goto err_isink; } - led = kzalloc(sizeof(*led), GFP_KERNEL); + led = devm_kzalloc(&pdev->dev, sizeof(*led), GFP_KERNEL); if (led == NULL) { ret = -ENOMEM; goto err_dcdc; @@ -259,12 +259,10 @@ static int wm8350_led_probe(struct platform_device *pdev) ret = led_classdev_register(&pdev->dev, &led->cdev); if (ret < 0) - goto err_led; + goto err_dcdc; return 0; - err_led: - kfree(led); err_dcdc: regulator_put(dcdc); err_isink: @@ -281,7 +279,6 @@ static int wm8350_led_remove(struct platform_device *pdev) wm8350_led_disable(led); regulator_put(led->dcdc); regulator_put(led->isink); - kfree(led); return 0; } From 96b62067f970ff529c98913311d33f4b57b453dc Mon Sep 17 00:00:00 2001 From: Steve Hodgson Date: Tue, 10 Jan 2012 15:09:47 -0800 Subject: [PATCH 079/124] btree: export btree_get_prev() so modules can use btree_for_each The btree_for_each API is implemented with macros that internally call btree_get_prev(), so if btree_get_prev() isn't exported then modules fail to link if they try to use one of the btree_for_each macros. Since the rest of the btree API is exported, we should keep things orthogonal and make this work too. Signed-off-by: Roland Dreier Signed-off-by: Steve Hodgson Acked-by: Joern Engel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/btree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/btree.c b/lib/btree.c index 2a34392bcecc36..e5ec1e9c1aa52c 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -357,6 +357,7 @@ void *btree_get_prev(struct btree_head *head, struct btree_geo *geo, } return NULL; } +EXPORT_SYMBOL_GPL(btree_get_prev); static int getpos(struct btree_geo *geo, unsigned long *node, unsigned long *key) From 270c49a088ae58d4b817861bb04bfec63b0966db Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:50 -0800 Subject: [PATCH 080/124] checkpatch: update signature "might be better as" warning email header lines can look like signature tags. It's valid to have multiple email recipients on a single line but not valid to have multiple signatures on a single line. Validate signatures only when not in the email headers. Clear the $in_commit_log flag when the patch filename appears. Add '-' to the valid chars in a message header for headers like "Message-Id:" and "In-Reply-To:". Signed-off-by: Joe Perches Reported-by: Julia Lawall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8fda3b3f7be87c..885e3b43d64af2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1504,9 +1504,11 @@ sub process { if ($line =~ /^diff --git.*?(\S+)$/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; + $in_commit_log = 0; } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@; + $in_commit_log = 0; $p1_prefix = $1; if (!$file && $tree && $p1_prefix ne '' && @@ -1546,7 +1548,8 @@ sub process { } # Check signature styles - if ($line =~ /^(\s*)($signature_tags)(\s*)(.*)/) { + if (!$in_header_lines && + $line =~ /^(\s*)($signature_tags)(\s*)(.*)/) { my $space_before = $1; my $sign_off = $2; my $space_after = $3; @@ -1623,7 +1626,7 @@ sub process { # Check if it's the start of a commit log # (not a header line and we haven't seen the patch filename) if ($in_header_lines && $realfile =~ /^$/ && - $rawline !~ /^(commit\b|from\b|\w+:).+$/i) { + $rawline !~ /^(commit\b|from\b|[\w-]+:).+$/i) { $in_header_lines = 0; $in_commit_log = 1; } From 5f14d3bd87ef5f979ea64c1f0862534d71786db7 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:52 -0800 Subject: [PATCH 081/124] checkpatch: prefer __printf over __attribute__((format(printf,...))) Add a warn for not using __printf. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 885e3b43d64af2..e94626ca77cf23 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3114,6 +3114,12 @@ sub process { "__aligned(size) is preferred over __attribute__((aligned(size)))\n" . $herecurr); } +# Check for __attribute__ format(printf, prefer __printf + if ($line =~ /\b__attribute__\s*\(\s*\(\s*format\s*\(\s*printf/) { + WARN("PREFER_PRINTF", + "__printf(string-index, first-to-check) is preferred over __attribute__((format(printf, string-index, first-to-check)))\n" . $herecurr); + } + # check for sizeof(&) if ($line =~ /\bsizeof\s*\(\s*\&/) { WARN("SIZEOF_ADDRESS", From f74bd1942e04a0cedd1e9c8b331141e75add49c0 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:09:54 -0800 Subject: [PATCH 082/124] checkpatch: correctly track the end of preprocessor commands in context When looking for a statement we currently run on through preprocessor commands. This means that a header file with just definitions is parsed over and over again combining all of the lines from the current line to the end of file leading to severe performance issues. Fix up context accumulation to track preprocessor commands and stop when reaching the end of them. At the same time vastly simplify the #define handling. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 90 +++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 51 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e94626ca77cf23..06e22caa5692d8 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -676,6 +676,10 @@ sub ctx_statement_block { if ($off >= $len) { last; } + if ($level == 0 && substr($blk, $off) =~ /^.\s*#\s*define/) { + $level++; + $type = '#'; + } } $p = $c; $c = substr($blk, $off, 1); @@ -738,6 +742,13 @@ sub ctx_statement_block { last; } } + # Preprocessor commands end at the newline unless escaped. + if ($type eq '#' && $c eq "\n" && $p ne "\\") { + $level--; + $type = ''; + $off++; + last; + } $off++; } # We are truly at the end, so shuffle to the next line. @@ -1801,6 +1812,8 @@ sub process { $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; +#print "stat<$stat>\n"; + # Find the real next line. $realline_next = $line_nr_next; if (defined $realline_next && @@ -2781,47 +2794,13 @@ sub process { my $cnt = $realcnt; my ($off, $dstat, $dcond, $rest); my $ctx = ''; - - my $args = defined($1); - - # Find the end of the macro and limit our statement - # search to that. - while ($cnt > 0 && defined $lines[$ln - 1] && - $lines[$ln - 1] =~ /^(?:-|..*\\$)/) - { - $ctx .= $rawlines[$ln - 1] . "\n"; - $cnt-- if ($lines[$ln - 1] !~ /^-/); - $ln++; - } - $ctx .= $rawlines[$ln - 1]; - ($dstat, $dcond, $ln, $cnt, $off) = - ctx_statement_block($linenr, $ln - $linenr + 1, 0); + ctx_statement_block($linenr, $realcnt, 0); + $ctx = $dstat; #print "dstat<$dstat> dcond<$dcond> cnt<$cnt> off<$off>\n"; #print "LINE<$lines[$ln-1]> len<" . length($lines[$ln-1]) . "\n"; - # Extract the remainder of the define (if any) and - # rip off surrounding spaces, and trailing \'s. - $rest = ''; - while ($off != 0 || ($cnt > 0 && $rest =~ /\\\s*$/)) { - #print "ADDING cnt<$cnt> $off <" . substr($lines[$ln - 1], $off) . "> rest<$rest>\n"; - if ($off != 0 || $lines[$ln - 1] !~ /^-/) { - $rest .= substr($lines[$ln - 1], $off) . "\n"; - $cnt--; - } - $ln++; - $off = 0; - } - $rest =~ s/\\\n.//g; - $rest =~ s/^\s*//s; - $rest =~ s/\s*$//s; - - # Clean up the original statement. - if ($args) { - substr($dstat, 0, length($dcond), ''); - } else { - $dstat =~ s/^.\s*\#\s*define\s+$Ident\s*//; - } + $dstat =~ s/^.\s*\#\s*define\s+$Ident(?:\([^\)]*\))?\s*//; $dstat =~ s/$;//g; $dstat =~ s/\\\n.//g; $dstat =~ s/^\s*//s; @@ -2847,23 +2826,32 @@ sub process { ^\"|\"$ }x; #print "REST<$rest> dstat<$dstat> ctx<$ctx>\n"; - if ($rest ne '' && $rest ne ',') { - if ($rest !~ /while\s*\(/ && - $dstat !~ /$exceptions/) - { - ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", - "Macros with multiple statements should be enclosed in a do - while loop\n" . "$here\n$ctx\n"); + if ($dstat ne '' && + $dstat !~ /^(?:$Ident|-?$Constant),$/ && # 10, // foo(), + $dstat !~ /^(?:$Ident|-?$Constant);$/ && # foo(); + $dstat !~ /^(?:$Ident|-?$Constant)$/ && # 10 // foo() + $dstat !~ /$exceptions/ && + $dstat !~ /^\.$Ident\s*=/ && # .foo = + $dstat !~ /^do\s*$Constant\s*while\s*$Constant;$/ && # do {...} while (...); + $dstat !~ /^for\s*$Constant$/ && # for (...) + $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() + $dstat !~ /^do\s*{/ && # do {... + $dstat !~ /^\({/) # ({... + { + $ctx =~ s/\n*$//; + my $herectx = $here . "\n"; + my $cnt = statement_rawlines($ctx); + + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; } - } elsif ($ctx !~ /;/) { - if ($dstat ne '' && - $dstat !~ /^(?:$Ident|-?$Constant)$/ && - $dstat !~ /$exceptions/ && - $dstat !~ /^\.$Ident\s*=/ && - $dstat =~ /$Operators/) - { + if ($dstat =~ /;/) { + ERROR("MULTISTATEMENT_MACRO_USE_DO_WHILE", + "Macros with multiple statements should be enclosed in a do - while loop\n" . "$herectx"); + } else { ERROR("COMPLEX_MACRO", - "Macros with complex values should be enclosed in parenthesis\n" . "$here\n$ctx\n"); + "Macros with complex values should be enclosed in parenthesis\n" . "$herectx"); } } } From 554e165cf32610ec9596a183fa1b54a5707fc3cb Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:09:57 -0800 Subject: [PATCH 083/124] checkpatch: check for common memset parameter issues against statments Move the memset checks over to work against the statement. Also add checks for 0 and 1 used as lengths. Generally these indicate badly ordered parameters. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 06e22caa5692d8..8199d59d0ad745 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3120,6 +3120,28 @@ sub process { "Avoid line continuations in quoted strings\n" . $herecurr); } +# Check for misused memsets + if (defined $stat && $stat =~ /\bmemset\s*\((.*)\)/s) { + my $args = $1; + + # Flatten any parentheses and braces + while ($args =~ s/\([^\(\)]*\)/10/s || + $args =~ s/\{[^\{\}]*\}/10/s || + $args =~ s/\[[^\[\]]*\]/10/s) + { + } + # Extract the simplified arguments. + my ($ms_addr, $ms_val, $ms_size) = + split(/\s*,\s*/, $args); + if ($ms_size =~ /^(0x|)0$/i) { + ERROR("MEMSET", + "memset size is 3rd argument, not the second.\n" . $herecurr); + } elsif ($ms_size =~ /^(0x|)1$/i) { + WARN("MEMSET", + "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . $herecurr); + } + } + # check for new externs in .c files. if ($realfile =~ /\.c$/ && defined $stat && $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s) @@ -3291,12 +3313,6 @@ sub process { WARN("EXPORTED_WORLD_WRITABLE", "Exporting world writable files is usually an error. Consider more restrictive permissions.\n" . $herecurr); } - - # Check for memset with swapped arguments - if ($line =~ /memset.*\,(\ |)(0x|)0(\ |0|)\);/) { - ERROR("MEMSET", - "memset size is 3rd argument, not the second.\n" . $herecurr); - } } # If we have no input at all, then there is nothing to report on From d7c76ba7e58bc3ca674f20759c686535db484749 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 10 Jan 2012 15:09:58 -0800 Subject: [PATCH 084/124] checkpatch: improve memset and min/max with cast checking Improve the checking of arguments to memset and min/max tests. Move the checking of min/max to statement blocks instead of single line. Change $Constant to allow any case type 0x initiator and trailing ul specifier. Add $FuncArg type as any function argument with or without a cast. Print the whole statement when showing memset or min/max messages. Improve the memset with 0 as 3rd argument error message. There are still weaknesses in the $FuncArg and $Constant code as arbitrary parentheses and negative signs are not generically supported. [akpm@linux-foundation.org: fix per Andy] Signed-off-by: Joe Perches Acked-by: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 69 +++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 36 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 8199d59d0ad745..4c53d6f67339bc 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -227,7 +227,7 @@ sub help { our $Member = qr{->$Ident|\.$Ident|\[[^]]*\]}; our $Lval = qr{$Ident(?:$Member)*}; -our $Constant = qr{(?:[0-9]+|0x[0-9a-fA-F]+)[UL]*}; +our $Constant = qr{(?i:(?:[0-9]+|0x[0-9a-f]+)[ul]*)}; our $Assignment = qr{(?:\*\=|/=|%=|\+=|-=|<<=|>>=|&=|\^=|\|=|=)}; our $Compare = qr{<=|>=|==|!=|<|>}; our $Operators = qr{ @@ -334,6 +334,7 @@ sub build_types { our $Typecast = qr{\s*(\(\s*$NonptrType\s*\)){0,1}\s*}; our $LvalOrFunc = qr{($Lval)\s*($match_balanced_parentheses{0,1})\s*}; +our $FuncArg = qr{$Typecast{0,1}($LvalOrFunc|$Constant)}; sub deparenthesize { my ($string) = @_; @@ -2609,28 +2610,6 @@ sub process { } } -# typecasts on min/max could be min_t/max_t - if ($line =~ /^\+(?:.*?)\b(min|max)\s*\($Typecast{0,1}($LvalOrFunc)\s*,\s*$Typecast{0,1}($LvalOrFunc)\s*\)/) { - if (defined $2 || defined $8) { - my $call = $1; - my $cast1 = deparenthesize($2); - my $arg1 = $3; - my $cast2 = deparenthesize($8); - my $arg2 = $9; - my $cast; - - if ($cast1 ne "" && $cast2 ne "") { - $cast = "$cast1 or $cast2"; - } elsif ($cast1 ne "") { - $cast = $cast1; - } else { - $cast = $cast2; - } - WARN("MINMAX", - "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . $herecurr); - } - } - # Need a space before open parenthesis after if, while etc if ($line=~/\b(if|while|for|switch)\(/) { ERROR("SPACING", "space required before the open parenthesis '('\n" . $herecurr); @@ -3121,24 +3100,42 @@ sub process { } # Check for misused memsets - if (defined $stat && $stat =~ /\bmemset\s*\((.*)\)/s) { - my $args = $1; + if (defined $stat && + $stat =~ /^\+(?:.*?)\bmemset\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\,\s*$FuncArg\s*\)/s) { + + my $ms_addr = $2; + my $ms_val = $8; + my $ms_size = $14; - # Flatten any parentheses and braces - while ($args =~ s/\([^\(\)]*\)/10/s || - $args =~ s/\{[^\{\}]*\}/10/s || - $args =~ s/\[[^\[\]]*\]/10/s) - { - } - # Extract the simplified arguments. - my ($ms_addr, $ms_val, $ms_size) = - split(/\s*,\s*/, $args); if ($ms_size =~ /^(0x|)0$/i) { ERROR("MEMSET", - "memset size is 3rd argument, not the second.\n" . $herecurr); + "memset to 0's uses 0 as the 2nd argument, not the 3rd\n" . "$here\n$stat\n"); } elsif ($ms_size =~ /^(0x|)1$/i) { WARN("MEMSET", - "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . $herecurr); + "single byte memset is suspicious. Swapped 2nd/3rd argument?\n" . "$here\n$stat\n"); + } + } + +# typecasts on min/max could be min_t/max_t + if (defined $stat && + $stat =~ /^\+(?:.*?)\b(min|max)\s*\(\s*$FuncArg\s*,\s*$FuncArg\s*\)/) { + if (defined $2 || defined $8) { + my $call = $1; + my $cast1 = deparenthesize($2); + my $arg1 = $3; + my $cast2 = deparenthesize($8); + my $arg2 = $9; + my $cast; + + if ($cast1 ne "" && $cast2 ne "") { + $cast = "$cast1 or $cast2"; + } elsif ($cast1 ne "") { + $cast = $cast1; + } else { + $cast = $cast2; + } + WARN("MINMAX", + "$call() should probably be ${call}_t($cast, $arg1, $arg2)\n" . "$here\n$stat\n"); } } From 89a883530fe79939384a6c6ed893c719762c7c9c Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:00 -0800 Subject: [PATCH 085/124] checkpatch: ## is not a valid modifier Inserting a # into the modifiers list will incorrectly add the null string to the modifiers list, leading to an infinite loop. As neither of these is a valid modifier form simply ignore them. Signed-off-by: Andy Whitcroft Reported-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 4c53d6f67339bc..b4390cf818da63 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1224,7 +1224,9 @@ sub possible { case| else| asm|__asm__| - do + do| + \#| + \#\#| )(?:\s|$)| ^(?:typedef|struct|enum)\b )}x; From 3e469cdc08ac5d84b220f8fb76a090d158d5114f Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:01 -0800 Subject: [PATCH 086/124] checkpatch: optimise statement scanner when mid-statement In the middle of a long definition or similar, there is no possibility of finding a smaller sub-statement. Optimise this case by skipping statement aquirey where there are no starts of statement (open brace '{' or semi-colon ';'). We are likely to scan slightly more than needed still but this is safest. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b4390cf818da63..618c0b5db0beed 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1373,6 +1373,7 @@ sub process { my %suppress_ifbraces; my %suppress_whiletrailers; my %suppress_export; + my $suppress_statement = 0; # Pre-scan the patch sanitizing the lines. # Pre-scan the patch looking for any __setup documentation. @@ -1482,6 +1483,7 @@ sub process { %suppress_ifbraces = (); %suppress_whiletrailers = (); %suppress_export = (); + $suppress_statement = 0; next; # track the line number as we move through the hunk, note that @@ -1809,13 +1811,23 @@ sub process { # Check for potential 'bare' types my ($stat, $cond, $line_nr_next, $remain_next, $off_next, $realline_next); - if ($realcnt && $line =~ /.\s*\S/) { +#print "LINE<$line>\n"; + if ($linenr >= $suppress_statement && + $realcnt && $line =~ /.\s*\S/) { ($stat, $cond, $line_nr_next, $remain_next, $off_next) = ctx_statement_block($linenr, $realcnt, 0); $stat =~ s/\n./\n /g; $cond =~ s/\n./\n /g; -#print "stat<$stat>\n"; +#print "linenr<$linenr> <$stat>\n"; + # If this statement has no statement boundaries within + # it there is no point in retrying a statement scan + # until we hit end of it. + my $frag = $stat; $frag =~ s/;+\s*$//; + if ($frag !~ /(?:{|;)/) { +#print "skip<$line_nr_next>\n"; + $suppress_statement = $line_nr_next; + } # Find the real next line. $realline_next = $line_nr_next; @@ -1942,6 +1954,9 @@ sub process { # Check relative indent for conditionals and blocks. if ($line =~ /\b(?:(?:if|while|for)\s*\(|do\b)/ && $line !~ /^.\s*#/ && $line !~ /\}\s*while\s*/) { + ($stat, $cond, $line_nr_next, $remain_next, $off_next) = + ctx_statement_block($linenr, $realcnt, 0) + if (!defined $stat); my ($s, $c) = ($stat, $cond); substr($s, 0, length($c), ''); @@ -2620,6 +2635,9 @@ sub process { # Check for illegal assignment in if conditional -- and check for trailing # statements after the conditional. if ($line =~ /do\s*(?!{)/) { + ($stat, $cond, $line_nr_next, $remain_next, $off_next) = + ctx_statement_block($linenr, $realcnt, 0) + if (!defined $stat); my ($stat_next) = ctx_statement_block($line_nr_next, $remain_next, $off_next); $stat_next =~ s/\n./\n /g; From a13858033a3a993147d190317cc9d709f0a1b819 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:03 -0800 Subject: [PATCH 087/124] checkpatch: only apply kconfig help checks for options which prompt The intent of this check is to catch the options which the user will see and ensure they are properly described. It is also common for internal only options to have a brief description. Allow this form. Reported-by: Steven Rostedt Tested-by: Steven Rostedt Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 618c0b5db0beed..d8ac16ab5e61e5 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1674,19 +1674,26 @@ sub process { # Only applies when adding the entry originally, after that we do not have # sufficient context to determine whether it is indeed long enough. if ($realfile =~ /Kconfig/ && - $line =~ /\+\s*(?:---)?help(?:---)?$/) { + $line =~ /.\s*config\s+/) { my $length = 0; my $cnt = $realcnt; my $ln = $linenr + 1; my $f; + my $is_start = 0; my $is_end = 0; - while ($cnt > 0 && defined $lines[$ln - 1]) { + for (; $cnt > 0 && defined $lines[$ln - 1]; $ln++) { $f = $lines[$ln - 1]; $cnt-- if ($lines[$ln - 1] !~ /^-/); $is_end = $lines[$ln - 1] =~ /^\+/; - $ln++; next if ($f =~ /^-/); + + if ($lines[$ln - 1] =~ /.\s*(?:bool|tristate)\s*\"/) { + $is_start = 1; + } elsif ($lines[$ln - 1] =~ /.\s*(?:---)?help(?:---)?$/) { + $length = -1; + } + $f =~ s/^.//; $f =~ s/#.*//; $f =~ s/^\s+//; @@ -1698,8 +1705,8 @@ sub process { $length++; } WARN("CONFIG_DESCRIPTION", - "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_end && $length < 4); - #print "is_end<$is_end> length<$length>\n"; + "please write a paragraph that describes the config symbol fully\n" . $herecurr) if ($is_start && $is_end && $length < 4); + #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } if (($realfile =~ /Makefile.*/ || $realfile =~ /Kbuild.*/) && From 87a53877185627b49a903023255425bda78f890c Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:04 -0800 Subject: [PATCH 088/124] checkpatch: fix EXPORT_SYMBOL handling following a function The following fragment defeats the DEVICE_ATTR style handing, check for and ignore the close brace '}' in this context: int foo() { } DEVICE_ATTR(link_power_management_policy, S_IRUGO | S_IWUSR, ata_scsi_lpm_show, ata_scsi_lpm_put); EXPORT_SYMBOL_GPL(dev_attr_link_power_management_policy); Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index d8ac16ab5e61e5..afc656d0058990 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2131,7 +2131,7 @@ sub process { # XXX(foo); # EXPORT_SYMBOL(something_foo); my $name = $1; - if ($stat =~ /^.([A-Z_]+)\s*\(\s*($Ident)/ && + if ($stat =~ /^(?:.\s*}\s*\n)?.([A-Z_]+)\s*\(\s*($Ident)/ && $name =~ /^${Ident}_$2/) { #print "FOO C name<$name>\n"; $suppress_export{$realline_next} = 1; From 72f115f94d500fc72f78c5df8104a98f8b9cc273 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:06 -0800 Subject: [PATCH 089/124] checkpatch: complex macro should allow the empty do while loop It is common to stub out a function as below, this is triggering a complex macro format incorrectly. Sort this out: #define cma_early_regions_reserve(reserve) do { } while (0) Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index afc656d0058990..ca6d0fb229f2c2 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2838,7 +2838,7 @@ sub process { $dstat !~ /^(?:$Ident|-?$Constant)$/ && # 10 // foo() $dstat !~ /$exceptions/ && $dstat !~ /^\.$Ident\s*=/ && # .foo = - $dstat !~ /^do\s*$Constant\s*while\s*$Constant;$/ && # do {...} while (...); + $dstat !~ /^do\s*$Constant\s*while\s*$Constant;?$/ && # do {...} while (...); // do {...} while (...) $dstat !~ /^for\s*$Constant$/ && # for (...) $dstat !~ /^for\s*$Constant\s+(?:$Ident|-?$Constant)$/ && # for (...) bar() $dstat !~ /^do\s*{/ && # do {... From e01886ada28741d7cb2cfb3224e9caccfbc1a2d5 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:08 -0800 Subject: [PATCH 090/124] checkpatch: fix 'return is not a function' square bracket handling We are incorrectly matching square brackets '[' and ']' leading to false positives on more complex functions as below: return (dt3155_fbuffer[m]->ready_head - dt3155_fbuffer[m]->ready_len + dt3155_fbuffer[m]->nbuffers)% (dt3155_fbuffer[m]->nbuffers); Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ca6d0fb229f2c2..5e3f4191f95966 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2609,7 +2609,7 @@ sub process { # Flatten any parentheses $value =~ s/\(/ \(/g; $value =~ s/\)/\) /g; - while ($value =~ s/\[[^\{\}]*\]/1/ || + while ($value =~ s/\[[^\[\]]*\]/1/ || $value !~ /(?:$Ident|-?$Constant)\s* $Compare\s* (?:$Ident|-?$Constant)/x && From c81769fdc84ed7c6eb3cc5cecb194324a5e4c8ad Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:10 -0800 Subject: [PATCH 091/124] checkpatch: fix complex macros handling of square brackets Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 5e3f4191f95966..ba7bcf34ad01e6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2815,7 +2815,7 @@ sub process { # Flatten any parentheses and braces while ($dstat =~ s/\([^\(\)]*\)/1/ || $dstat =~ s/\{[^\{\}]*\}/1/ || - $dstat =~ s/\[[^\{\}]*\]/1/) + $dstat =~ s/\[[^\[\]]*\]/1/) { } From addcdcea99514bee64b5bf091ac9fd2fc5da65cf Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:11 -0800 Subject: [PATCH 092/124] checkpatch: ensure cast type is unique in the context parser Ensure the cast type is unique in the context parser, we do not want them to detect as a comma ','. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index ba7bcf34ad01e6..497416c8360be6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1032,7 +1032,7 @@ sub annotate_values { } elsif ($cur =~ /^(\(\s*$Type\s*)\)/ && $av_pending eq '_') { print "CAST($1)\n" if ($dbg_values > 1); push(@av_paren_type, $type); - $type = 'C'; + $type = 'c'; } elsif ($cur =~ /^($Type)\s*(?:$Ident|,|\)|\(|\s*$)/) { print "DECLARE($1)\n" if ($dbg_values > 1); From 6b48db24e30d371bc54566667b82ca3d64aab80a Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:13 -0800 Subject: [PATCH 093/124] checkpatch: typeof may have more complex arguments typeof may have various more complex forms as its arguement, not just an identifier. For now allow us to leak to the first close perenthesis ')'. Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 497416c8360be6..eb4b55940c0ea0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -315,7 +315,7 @@ sub build_types { $NonptrType = qr{ (?:$Modifier\s+|const\s+)* (?: - (?:typeof|__typeof__)\s*\(\s*\**\s*$Ident\s*\)| + (?:typeof|__typeof__)\s*\([^\)]*\)| (?:$typeTypedefs\b)| (?:${all}\b) ) From bfcb2cc798a14230d22b6dd999e2e680623de622 Mon Sep 17 00:00:00 2001 From: Andy Whitcroft Date: Tue, 10 Jan 2012 15:10:15 -0800 Subject: [PATCH 094/124] checkpatch: catch all occurences of type and cast spacing errors per line Fix up type and cast spacing checks such that all occurences on a line are examined and reported. For example the line below has a valid cast and a bad type, but currently we check the cast first which is good and stop: u16* bar = (u16 *)baz; We will also only report one of the errors in this example: u16* bar = (u16*)bad; Move to iterating across all casts and all types, reporting any failure. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index eb4b55940c0ea0..e3bfcbe8a520b6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2209,8 +2209,9 @@ sub process { # * goes on variable not on type # (char*[ const]) - if ($line =~ m{\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\)}) { - my ($from, $to) = ($1, $1); + while ($line =~ m{(\($NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)\))}g) { + #print "AA<$1>\n"; + my ($from, $to) = ($2, $2); # Should start with a space. $to =~ s/^(\S)/ $1/; @@ -2225,8 +2226,10 @@ sub process { ERROR("POINTER_LOCATION", "\"(foo$from)\" should be \"(foo$to)\"\n" . $herecurr); } - } elsif ($line =~ m{\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident)}) { - my ($from, $to, $ident) = ($1, $1, $2); + } + while ($line =~ m{(\b$NonptrType(\s*(?:$Modifier\b\s*|\*\s*)+)($Ident))}g) { + #print "BB<$1>\n"; + my ($from, $to, $ident) = ($2, $2, $3); # Should start with a space. $to =~ s/^(\S)/ $1/; From 5742332dea5560d6c449b007d9539dbdc8ee531b Mon Sep 17 00:00:00 2001 From: Joakim Tjernlund Date: Tue, 10 Jan 2012 15:10:18 -0800 Subject: [PATCH 095/124] crc32: optimize inner loop Taking a pointer reference to each row in the crc table matrix, one can reduce the inner loop with a few insn's Signed-off-by: Joakim Tjernlund Cc: Bob Pearson Cc: Frank Zago Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/crc32.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/crc32.c b/lib/crc32.c index a6e633a48cea88..4b35d2b4437cc7 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -51,20 +51,21 @@ static inline u32 crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN -# define DO_CRC(x) crc = tab[0][(crc ^ (x)) & 255] ^ (crc >> 8) -# define DO_CRC4 crc = tab[3][(crc) & 255] ^ \ - tab[2][(crc >> 8) & 255] ^ \ - tab[1][(crc >> 16) & 255] ^ \ - tab[0][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) +# define DO_CRC4 crc = t3[(crc) & 255] ^ \ + t2[(crc >> 8) & 255] ^ \ + t1[(crc >> 16) & 255] ^ \ + t0[(crc >> 24) & 255] # else -# define DO_CRC(x) crc = tab[0][((crc >> 24) ^ (x)) & 255] ^ (crc << 8) -# define DO_CRC4 crc = tab[0][(crc) & 255] ^ \ - tab[1][(crc >> 8) & 255] ^ \ - tab[2][(crc >> 16) & 255] ^ \ - tab[3][(crc >> 24) & 255] +# define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) +# define DO_CRC4 crc = t0[(crc) & 255] ^ \ + t1[(crc >> 8) & 255] ^ \ + t2[(crc >> 16) & 255] ^ \ + t3[(crc >> 24) & 255] # endif const u32 *b; size_t rem_len; + const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; /* Align it */ if (unlikely((long)buf & 3 && len)) { From e39f560239984c3098237ad94c9449b1494163f8 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:21 -0800 Subject: [PATCH 096/124] fs: binfmt_elf: create Kconfig variable for PIE randomization Randomization of PIE load address is hard coded in binfmt_elf.c for X86 and ARM. Create a new Kconfig variable (CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE) for this and use it instead. Thus architecture specific policy is pushed out of the generic binfmt_elf.c and into the architecture Kconfig files. X86 and ARM Kconfigs are modified to select the new variable so there is no change in behavior. A follow on patch will select it for MIPS too. Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Acked-by: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 1 + arch/x86/Kconfig | 1 + fs/Kconfig.binfmt | 3 +++ fs/binfmt_elf.c | 2 +- 4 files changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9d66dfc33a5ac2..98a6459cd39814 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -16,6 +16,7 @@ config ARM select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL) select HAVE_DYNAMIC_FTRACE if (!XIP_KERNEL) select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1d2a69dd36d89c..d6ddc0bfe36a8b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select ANON_INODES select HAVE_ARCH_KMEMCHECK select HAVE_USER_RETURN_NOTIFIER + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL select HAVE_TEXT_POKE_SMP select HAVE_GENERIC_HARDIRQS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 79e2ca7973b7a2..e95d1b64082cae 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,6 +27,9 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF +config ARCH_BINFMT_ELF_RANDOMIZE_PIE + bool + config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 21ac5ee4b43f3e..bcb884e2d613e7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -794,7 +794,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#if defined(CONFIG_X86) || defined(CONFIG_ARM) +#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE /* Memory randomization might have been switched off * in runtime via sysctl. * If that is the case, retain the original non-zero From e26d196cc88c5436ca93532cabce87dbd8bf49b4 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:10:22 -0800 Subject: [PATCH 097/124] MIPS: randomize PIE load address ... by selecting ARCH_BINFMT_ELF_RANDOMIZE_PIE Signed-off-by: David Daney Cc: Russell King Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Alexander Viro Cc: H. Peter Anvin Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a7636d3ddc6a37..c529cfe52c5336 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -16,6 +16,7 @@ config MIPS select HAVE_FUNCTION_GRAPH_TRACER select HAVE_KPROBES select HAVE_KRETPROBES + select ARCH_BINFMT_ELF_RANDOMIZE_PIE select RTC_LIB if !MACH_LOONGSON select GENERIC_ATOMIC64 if !64BIT select HAVE_DMA_ATTRS From b43c1ea4d622b6951377de92edfb219d893e23ef Mon Sep 17 00:00:00 2001 From: Ondrej Zary Date: Tue, 10 Jan 2012 15:10:26 -0800 Subject: [PATCH 098/124] drivers/rtc/rtc-cmos.c: fix broken NVRAM bank 2 writing Fix writing to NVRAM bank 2 in rtc-cmos driver. It never worked since its introduction in 2.6.28 because of a typo. Signed-off-by: Ondrej Zary Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-cmos.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 05beb6c1ca79c7..d7782aa0994395 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -164,7 +164,7 @@ static inline unsigned char cmos_read_bank2(unsigned char addr) static inline void cmos_write_bank2(unsigned char val, unsigned char addr) { outb(addr, RTC_PORT(2)); - outb(val, RTC_PORT(2)); + outb(val, RTC_PORT(3)); } #else From 7287be1d0ac8c82999b67c2a33517c6ec9cfdbe7 Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 10 Jan 2012 15:10:32 -0800 Subject: [PATCH 099/124] drivers/rtc/rtc-mxc.c: fix setting time for MX1 SoC There is no way to track year in the i.MX1 RTC: Days Counter register is 9-bit wide only. Attempt to save date after 1970-01-01 plus 512 days causes endless loop in mxc_rtc_set_mmss(). Fix this by resetting year to 1970. [akpm@linux-foundation.org: use conventional comment layout] Signed-off-by: Yauhen Kharuzhy Cc: Daniel Mack Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mxc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 39e41fbdf08ba6..11b7b614fc8d6e 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -290,6 +290,17 @@ static int mxc_rtc_read_time(struct device *dev, struct rtc_time *tm) */ static int mxc_rtc_set_mmss(struct device *dev, unsigned long time) { + /* + * TTC_DAYR register is 9-bit in MX1 SoC, save time and day of year only + */ + if (cpu_is_mx1()) { + struct rtc_time tm; + + rtc_time_to_tm(time, &tm); + tm.tm_year = 70; + rtc_tm_to_time(&tm, &time); + } + /* Avoid roll-over from reading the different registers */ do { set_alarm_or_time(dev, MXC_RTC_TIME, time); From c92182ee0b5a33c74e4b6c0ded36166e4ef3bc3e Mon Sep 17 00:00:00 2001 From: Yauhen Kharuzhy Date: Tue, 10 Jan 2012 15:10:34 -0800 Subject: [PATCH 100/124] drivers/rtc/rtc-mxc.c: make alarm work Fix alarm IRQ handling, make the alarm one-shot. Cleanup black magick with a validation of already validated time data. Add ability to wake the system with alarm. [akpm@linux-foundation.org: fix CONFIG_PM=n build] Signed-off-by: Yauhen Kharuzhy Cc: Daniel Mack Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mxc.c | 112 ++++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 53 deletions(-) diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 11b7b614fc8d6e..5e1d64ee52289b 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -155,7 +155,6 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm) { struct rtc_time alarm_tm, now_tm; unsigned long now, time; - int ret; struct platform_device *pdev = to_platform_device(dev); struct rtc_plat_data *pdata = platform_get_drvdata(pdev); void __iomem *ioaddr = pdata->ioaddr; @@ -168,21 +167,33 @@ static int rtc_update_alarm(struct device *dev, struct rtc_time *alrm) alarm_tm.tm_hour = alrm->tm_hour; alarm_tm.tm_min = alrm->tm_min; alarm_tm.tm_sec = alrm->tm_sec; - rtc_tm_to_time(&now_tm, &now); rtc_tm_to_time(&alarm_tm, &time); - if (time < now) { - time += 60 * 60 * 24; - rtc_time_to_tm(time, &alarm_tm); - } - - ret = rtc_tm_to_time(&alarm_tm, &time); - /* clear all the interrupt status bits */ writew(readw(ioaddr + RTC_RTCISR), ioaddr + RTC_RTCISR); set_alarm_or_time(dev, MXC_RTC_ALARM, time); - return ret; + return 0; +} + +static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit, + unsigned int enabled) +{ + struct platform_device *pdev = to_platform_device(dev); + struct rtc_plat_data *pdata = platform_get_drvdata(pdev); + void __iomem *ioaddr = pdata->ioaddr; + u32 reg; + + spin_lock_irq(&pdata->rtc->irq_lock); + reg = readw(ioaddr + RTC_RTCIENR); + + if (enabled) + reg |= bit; + else + reg &= ~bit; + + writew(reg, ioaddr + RTC_RTCIENR); + spin_unlock_irq(&pdata->rtc->irq_lock); } /* This function is the RTC interrupt service routine. */ @@ -199,13 +210,12 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) /* clear interrupt sources */ writew(status, ioaddr + RTC_RTCISR); - /* clear alarm interrupt if it has occurred */ - if (status & RTC_ALM_BIT) - status &= ~RTC_ALM_BIT; - /* update irq data & counter */ - if (status & RTC_ALM_BIT) + if (status & RTC_ALM_BIT) { events |= (RTC_AF | RTC_IRQF); + /* RTC alarm should be one-shot */ + mxc_rtc_irq_enable(&pdev->dev, RTC_ALM_BIT, 0); + } if (status & RTC_1HZ_BIT) events |= (RTC_UF | RTC_IRQF); @@ -213,9 +223,6 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) if (status & PIT_ALL_ON) events |= (RTC_PF | RTC_IRQF); - if ((status & RTC_ALM_BIT) && rtc_valid_tm(&pdata->g_rtc_alarm)) - rtc_update_alarm(&pdev->dev, &pdata->g_rtc_alarm); - rtc_update_irq(pdata->rtc, 1, events); spin_unlock_irq(&pdata->rtc->irq_lock); @@ -242,26 +249,6 @@ static void mxc_rtc_release(struct device *dev) spin_unlock_irq(&pdata->rtc->irq_lock); } -static void mxc_rtc_irq_enable(struct device *dev, unsigned int bit, - unsigned int enabled) -{ - struct platform_device *pdev = to_platform_device(dev); - struct rtc_plat_data *pdata = platform_get_drvdata(pdev); - void __iomem *ioaddr = pdata->ioaddr; - u32 reg; - - spin_lock_irq(&pdata->rtc->irq_lock); - reg = readw(ioaddr + RTC_RTCIENR); - - if (enabled) - reg |= bit; - else - reg &= ~bit; - - writew(reg, ioaddr + RTC_RTCIENR); - spin_unlock_irq(&pdata->rtc->irq_lock); -} - static int mxc_rtc_alarm_irq_enable(struct device *dev, unsigned int enabled) { mxc_rtc_irq_enable(dev, RTC_ALM_BIT, enabled); @@ -335,21 +322,7 @@ static int mxc_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alrm) struct rtc_plat_data *pdata = platform_get_drvdata(pdev); int ret; - if (rtc_valid_tm(&alrm->time)) { - if (alrm->time.tm_sec > 59 || - alrm->time.tm_hour > 23 || - alrm->time.tm_min > 59) - return -EINVAL; - - ret = rtc_update_alarm(dev, &alrm->time); - } else { - ret = rtc_valid_tm(&alrm->time); - if (ret) - return ret; - - ret = rtc_update_alarm(dev, &alrm->time); - } - + ret = rtc_update_alarm(dev, &alrm->time); if (ret) return ret; @@ -435,6 +408,9 @@ static int __init mxc_rtc_probe(struct platform_device *pdev) pdata->irq = -1; } + if (pdata->irq >=0) + device_init_wakeup(&pdev->dev, 1); + rtc = rtc_device_register(pdev->name, &pdev->dev, &mxc_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { @@ -470,9 +446,39 @@ static int __exit mxc_rtc_remove(struct platform_device *pdev) return 0; } +#ifdef CONFIG_PM +static int mxc_rtc_suspend(struct device *dev) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + enable_irq_wake(pdata->irq); + + return 0; +} + +static int mxc_rtc_resume(struct device *dev) +{ + struct rtc_plat_data *pdata = dev_get_drvdata(dev); + + if (device_may_wakeup(dev)) + disable_irq_wake(pdata->irq); + + return 0; +} + +static struct dev_pm_ops mxc_rtc_pm_ops = { + .suspend = mxc_rtc_suspend, + .resume = mxc_rtc_resume, +}; +#endif + static struct platform_driver mxc_rtc_driver = { .driver = { .name = "mxc_rtc", +#ifdef CONFIG_PM + .pm = &mxc_rtc_pm_ops, +#endif .owner = THIS_MODULE, }, .remove = __exit_p(mxc_rtc_remove), From 10d065e65b0be33e868f9c6da67026b5111480d8 Mon Sep 17 00:00:00 2001 From: Robert Marklund Date: Tue, 10 Jan 2012 15:10:35 -0800 Subject: [PATCH 101/124] rtc/ab8500: don't disable IRQ:s when suspending We want this driver to be able to wake up the system. Signed-off-by: Robert Marklund Signed-off-by: Linus Walleij Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ab8500.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index e346705aae92f1..d6bfc4f3658e43 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -316,8 +316,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return err; } - err = request_threaded_irq(irq, NULL, rtc_alarm_handler, 0, - "ab8500-rtc", rtc); + err = request_threaded_irq(irq, NULL, rtc_alarm_handler, + IRQF_NO_SUSPEND, "ab8500-rtc", rtc); if (err < 0) { rtc_device_unregister(rtc); return err; From b62581e6241c33b9fef45117f86830058738371f Mon Sep 17 00:00:00 2001 From: Andrew Lynn Date: Tue, 10 Jan 2012 15:10:38 -0800 Subject: [PATCH 102/124] rtc/ab8500: set can_wake flag Set can_wake flag so wakealarm property is visible in sysfs. Signed-off-by: Andrew Lynn Reviewed-by: Jonas ABERG Signed-off-by: Linus Walleij Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ab8500.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index d6bfc4f3658e43..82a348095a715d 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -308,6 +308,8 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return -ENODEV; } + device_init_wakeup(&pdev->dev, true); + rtc = rtc_device_register("ab8500-rtc", &pdev->dev, &ab8500_rtc_ops, THIS_MODULE); if (IS_ERR(rtc)) { From 012e52e15e7ebbc7b08165e8f4b10f71a3e6810b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 10 Jan 2012 15:10:41 -0800 Subject: [PATCH 103/124] drivers/rtc/rtc-ab8500.c: change msleep() to usleep_range() The resolution of msleep is related to HZ, so with HZ set to 100 any msleep of less than 10ms will become ~10ms. This is not what we want. Use the hrtimer-based usleep_range() and allow for some slack in the non-critical path so we have more control of what is happening here. Signed-off-by: Linus Walleij Cc: Jonas Aaberg Cc: Alessandro Zummo Cc: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-ab8500.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 82a348095a715d..919b2e5cb7f035 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -90,7 +90,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm) /* Early AB8500 chips will not clear the rtc read request bit */ if (abx500_get_chip_id(dev) == 0) { - msleep(1); + usleep_range(1000, 1000); } else { /* Wait for some cycles after enabling the rtc read in ab8500 */ while (time_before(jiffies, timeout)) { @@ -102,7 +102,7 @@ static int ab8500_rtc_read_time(struct device *dev, struct rtc_time *tm) if (!(value & RTC_READ_REQUEST)) break; - msleep(1); + usleep_range(1000, 5000); } } @@ -295,7 +295,7 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) return err; /* Wait for reset by the PorRtc */ - msleep(1); + usleep_range(1000, 5000); err = abx500_get_register_interruptible(&pdev->dev, AB8500_RTC, AB8500_RTC_STAT_REG, &rtc_ctrl); From dda367ac064d7473d397b1965019fb3be7cfb6b0 Mon Sep 17 00:00:00 2001 From: Mark Godfrey Date: Tue, 10 Jan 2012 15:10:42 -0800 Subject: [PATCH 104/124] rtc/ab8500: add calibration attribute to AB8500 RTC The rtc_calibration attribute allows user-space to get and set the AB8500's RtcCalibration register. The AB8500 will then use the value in this register to compensate for RTC drift every 60 seconds. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mark Godfrey Signed-off-by: Linus Walleij Acked-by: Jean-Christophe PLAGNIOL-VILLARD Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ...ysfs-class-rtc-rtc0-device-rtc_calibration | 12 ++ drivers/rtc/rtc-ab8500.c | 112 ++++++++++++++++++ 2 files changed, 124 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration diff --git a/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration b/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration new file mode 100644 index 00000000000000..4cf1e72222d9c4 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-rtc-rtc0-device-rtc_calibration @@ -0,0 +1,12 @@ +What: Attribute for calibrating ST-Ericsson AB8500 Real Time Clock +Date: Oct 2011 +KernelVersion: 3.0 +Contact: Mark Godfrey +Description: The rtc_calibration attribute allows the userspace to + calibrate the AB8500.s 32KHz Real Time Clock. + Every 60 seconds the AB8500 will correct the RTC's value + by adding to it the value of this attribute. + The range of the attribute is -127 to +127 in units of + 30.5 micro-seconds (half-parts-per-million of the 32KHz clock) +Users: The /vendor/st-ericsson/base_utilities/core/rtc_calibration + daemon uses this interface. diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 919b2e5cb7f035..df7bfc304c5e8b 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -258,6 +258,109 @@ static int ab8500_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alarm) return ab8500_rtc_irq_enable(dev, alarm->enabled); } + +static int ab8500_rtc_set_calibration(struct device *dev, int calibration) +{ + int retval; + u8 rtccal = 0; + + /* + * Check that the calibration value (which is in units of 0.5 + * parts-per-million) is in the AB8500's range for RtcCalibration + * register. -128 (0x80) is not permitted because the AB8500 uses + * a sign-bit rather than two's complement, so 0x80 is just another + * representation of zero. + */ + if ((calibration < -127) || (calibration > 127)) { + dev_err(dev, "RtcCalibration value outside permitted range\n"); + return -EINVAL; + } + + /* + * The AB8500 uses sign (in bit7) and magnitude (in bits0-7) + * so need to convert to this sort of representation before writing + * into RtcCalibration register... + */ + if (calibration >= 0) + rtccal = 0x7F & calibration; + else + rtccal = ~(calibration - 1) | 0x80; + + retval = abx500_set_register_interruptible(dev, AB8500_RTC, + AB8500_RTC_CALIB_REG, rtccal); + + return retval; +} + +static int ab8500_rtc_get_calibration(struct device *dev, int *calibration) +{ + int retval; + u8 rtccal = 0; + + retval = abx500_get_register_interruptible(dev, AB8500_RTC, + AB8500_RTC_CALIB_REG, &rtccal); + if (retval >= 0) { + /* + * The AB8500 uses sign (in bit7) and magnitude (in bits0-7) + * so need to convert value from RtcCalibration register into + * a two's complement signed value... + */ + if (rtccal & 0x80) + *calibration = 0 - (rtccal & 0x7F); + else + *calibration = 0x7F & rtccal; + } + + return retval; +} + +static ssize_t ab8500_sysfs_store_rtc_calibration(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int retval; + int calibration = 0; + + if (sscanf(buf, " %i ", &calibration) != 1) { + dev_err(dev, "Failed to store RTC calibration attribute\n"); + return -EINVAL; + } + + retval = ab8500_rtc_set_calibration(dev, calibration); + + return retval ? retval : count; +} + +static ssize_t ab8500_sysfs_show_rtc_calibration(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int retval = 0; + int calibration = 0; + + retval = ab8500_rtc_get_calibration(dev, &calibration); + if (retval < 0) { + dev_err(dev, "Failed to read RTC calibration attribute\n"); + sprintf(buf, "0\n"); + return retval; + } + + return sprintf(buf, "%d\n", calibration); +} + +static DEVICE_ATTR(rtc_calibration, S_IRUGO | S_IWUSR, + ab8500_sysfs_show_rtc_calibration, + ab8500_sysfs_store_rtc_calibration); + +static int ab8500_sysfs_rtc_register(struct device *dev) +{ + return device_create_file(dev, &dev_attr_rtc_calibration); +} + +static void ab8500_sysfs_rtc_unregister(struct device *dev) +{ + device_remove_file(dev, &dev_attr_rtc_calibration); +} + static irqreturn_t rtc_alarm_handler(int irq, void *data) { struct rtc_device *rtc = data; @@ -327,6 +430,13 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); + + err = ab8500_sysfs_rtc_register(&pdev->dev); + if (err) { + dev_err(&pdev->dev, "sysfs RTC failed to register\n"); + return err; + } + return 0; } @@ -335,6 +445,8 @@ static int __devexit ab8500_rtc_remove(struct platform_device *pdev) struct rtc_device *rtc = platform_get_drvdata(pdev); int irq = platform_get_irq_byname(pdev, "ALARM"); + ab8500_sysfs_rtc_unregister(&pdev->dev); + free_irq(irq, rtc); rtc_device_unregister(rtc); platform_set_drvdata(pdev, NULL); From 2d65943e55bdd538640d0908bc9f3ead138b0431 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:10:43 -0800 Subject: [PATCH 105/124] drivers/rtc/rtc-wm831x.c: remove unused period IRQ handler Due to changes in the RTC core the period interrupt is now unused so delete the code managing it. Signed-off-by: Mark Brown Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-wm831x.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index bdc909bd56da0f..dabbd456dfe11b 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -324,15 +324,6 @@ static irqreturn_t wm831x_alm_irq(int irq, void *data) return IRQ_HANDLED; } -static irqreturn_t wm831x_per_irq(int irq, void *data) -{ - struct wm831x_rtc *wm831x_rtc = data; - - rtc_update_irq(wm831x_rtc->rtc, 1, RTC_IRQF | RTC_UF); - - return IRQ_HANDLED; -} - static const struct rtc_class_ops wm831x_rtc_ops = { .read_time = wm831x_rtc_readtime, .set_mmss = wm831x_rtc_set_mmss, @@ -405,7 +396,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) { struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent); struct wm831x_rtc *wm831x_rtc; - int per_irq = platform_get_irq_byname(pdev, "PER"); int alm_irq = platform_get_irq_byname(pdev, "ALM"); int ret = 0; @@ -433,14 +423,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) goto err; } - ret = request_threaded_irq(per_irq, NULL, wm831x_per_irq, - IRQF_TRIGGER_RISING, "RTC period", - wm831x_rtc); - if (ret != 0) { - dev_err(&pdev->dev, "Failed to request periodic IRQ %d: %d\n", - per_irq, ret); - } - ret = request_threaded_irq(alm_irq, NULL, wm831x_alm_irq, IRQF_TRIGGER_RISING, "RTC alarm", wm831x_rtc); @@ -459,11 +441,9 @@ static int wm831x_rtc_probe(struct platform_device *pdev) static int __devexit wm831x_rtc_remove(struct platform_device *pdev) { struct wm831x_rtc *wm831x_rtc = platform_get_drvdata(pdev); - int per_irq = platform_get_irq_byname(pdev, "PER"); int alm_irq = platform_get_irq_byname(pdev, "ALM"); free_irq(alm_irq, wm831x_rtc); - free_irq(per_irq, wm831x_rtc); rtc_device_unregister(wm831x_rtc->rtc); kfree(wm831x_rtc); From 5f85d20d04cdc4c6ed15022a5ed76907ad88d4ae Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 10 Jan 2012 15:10:44 -0800 Subject: [PATCH 106/124] drivers/rtc/rtc-wm831x.c: convert to devm_kzalloc() Marginally less code and eliminate the possibility of memory leaks. Signed-off-by: Mark Brown Acked-by: Jean-Christophe PLAGNIOL-VILLARD Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-wm831x.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index dabbd456dfe11b..657c6f67b28784 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -399,7 +399,7 @@ static int wm831x_rtc_probe(struct platform_device *pdev) int alm_irq = platform_get_irq_byname(pdev, "ALM"); int ret = 0; - wm831x_rtc = kzalloc(sizeof(*wm831x_rtc), GFP_KERNEL); + wm831x_rtc = devm_kzalloc(&pdev->dev, sizeof(*wm831x_rtc), GFP_KERNEL); if (wm831x_rtc == NULL) return -ENOMEM; @@ -434,7 +434,6 @@ static int wm831x_rtc_probe(struct platform_device *pdev) return 0; err: - kfree(wm831x_rtc); return ret; } @@ -445,7 +444,6 @@ static int __devexit wm831x_rtc_remove(struct platform_device *pdev) free_irq(alm_irq, wm831x_rtc); rtc_device_unregister(wm831x_rtc->rtc); - kfree(wm831x_rtc); return 0; } From 0c4eae66591a292fee70051ea363a8d27aa54102 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:48 -0800 Subject: [PATCH 107/124] rtc: convert drivers/rtc/* to use module_platform_driver() This patch converts the drivers in drivers/rtc/* to use the module_platform_driver() macro which makes the code smaller and a bit simpler. Signed-off-by: Axel Lin Acked-by: Mark Brown Acked-by: Mike Frysinger Acked-by: Guan Xuetao Acked-by: Linus Walleij Acked-by: Haojian Zhuang Cc: Alessandro Zummo Cc: Srinidhi Kasagar Cc: Lars-Peter Clausen Cc: Ben Dooks Cc: John Stultz Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-88pm860x.c | 12 +----------- drivers/rtc/rtc-ab8500.c | 12 +----------- drivers/rtc/rtc-bfin.c | 13 +------------ drivers/rtc/rtc-bq4802.c | 13 +------------ drivers/rtc/rtc-dm355evm.c | 12 +----------- drivers/rtc/rtc-ds1286.c | 13 +------------ drivers/rtc/rtc-ds1511.c | 15 +-------------- drivers/rtc/rtc-ds1553.c | 13 +------------ drivers/rtc/rtc-ds1742.c | 13 +------------ drivers/rtc/rtc-jz4740.c | 12 +----------- drivers/rtc/rtc-lpc32xx.c | 12 +----------- drivers/rtc/rtc-m48t35.c | 13 +------------ drivers/rtc/rtc-m48t59.c | 13 +------------ drivers/rtc/rtc-m48t86.c | 13 +------------ drivers/rtc/rtc-max8925.c | 12 +----------- drivers/rtc/rtc-max8998.c | 12 +----------- drivers/rtc/rtc-mpc5121.c | 12 +----------- drivers/rtc/rtc-mrst.c | 13 +------------ drivers/rtc/rtc-pcf50633.c | 12 +----------- drivers/rtc/rtc-pm8xxx.c | 12 +----------- drivers/rtc/rtc-s3c.c | 16 +--------------- drivers/rtc/rtc-sa1100.c | 13 +------------ drivers/rtc/rtc-spear.c | 12 +----------- drivers/rtc/rtc-stk17ta8.c | 13 +------------ drivers/rtc/rtc-stmp3xxx.c | 13 +------------ drivers/rtc/rtc-v3020.c | 13 +------------ drivers/rtc/rtc-vr41xx.c | 13 +------------ drivers/rtc/rtc-vt8500.c | 12 +----------- drivers/rtc/rtc-wm831x.c | 12 +----------- drivers/rtc/rtc-wm8350.c | 12 +----------- 30 files changed, 30 insertions(+), 351 deletions(-) diff --git a/drivers/rtc/rtc-88pm860x.c b/drivers/rtc/rtc-88pm860x.c index 64b847b7f9705e..f04761e6622ddd 100644 --- a/drivers/rtc/rtc-88pm860x.c +++ b/drivers/rtc/rtc-88pm860x.c @@ -410,17 +410,7 @@ static struct platform_driver pm860x_rtc_driver = { .remove = __devexit_p(pm860x_rtc_remove), }; -static int __init pm860x_rtc_init(void) -{ - return platform_driver_register(&pm860x_rtc_driver); -} -module_init(pm860x_rtc_init); - -static void __exit pm860x_rtc_exit(void) -{ - platform_driver_unregister(&pm860x_rtc_driver); -} -module_exit(pm860x_rtc_exit); +module_platform_driver(pm860x_rtc_driver); MODULE_DESCRIPTION("Marvell 88PM860x RTC driver"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index df7bfc304c5e8b..a0a9810adf0b91 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -463,18 +463,8 @@ static struct platform_driver ab8500_rtc_driver = { .remove = __devexit_p(ab8500_rtc_remove), }; -static int __init ab8500_rtc_init(void) -{ - return platform_driver_register(&ab8500_rtc_driver); -} - -static void __exit ab8500_rtc_exit(void) -{ - platform_driver_unregister(&ab8500_rtc_driver); -} +module_platform_driver(ab8500_rtc_driver); -module_init(ab8500_rtc_init); -module_exit(ab8500_rtc_exit); MODULE_AUTHOR("Virupax Sadashivpetimath "); MODULE_DESCRIPTION("AB8500 RTC Driver"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/rtc/rtc-bfin.c b/drivers/rtc/rtc-bfin.c index 90d866272c8ea5..abfc1a0c07d9cf 100644 --- a/drivers/rtc/rtc-bfin.c +++ b/drivers/rtc/rtc-bfin.c @@ -456,18 +456,7 @@ static struct platform_driver bfin_rtc_driver = { .resume = bfin_rtc_resume, }; -static int __init bfin_rtc_init(void) -{ - return platform_driver_register(&bfin_rtc_driver); -} - -static void __exit bfin_rtc_exit(void) -{ - platform_driver_unregister(&bfin_rtc_driver); -} - -module_init(bfin_rtc_init); -module_exit(bfin_rtc_exit); +module_platform_driver(bfin_rtc_driver); MODULE_DESCRIPTION("Blackfin On-Chip Real Time Clock Driver"); MODULE_AUTHOR("Mike Frysinger "); diff --git a/drivers/rtc/rtc-bq4802.c b/drivers/rtc/rtc-bq4802.c index 128270ce355d08..bf612ef2294177 100644 --- a/drivers/rtc/rtc-bq4802.c +++ b/drivers/rtc/rtc-bq4802.c @@ -218,15 +218,4 @@ static struct platform_driver bq4802_driver = { .remove = __devexit_p(bq4802_remove), }; -static int __init bq4802_init(void) -{ - return platform_driver_register(&bq4802_driver); -} - -static void __exit bq4802_exit(void) -{ - platform_driver_unregister(&bq4802_driver); -} - -module_init(bq4802_init); -module_exit(bq4802_exit); +module_platform_driver(bq4802_driver); diff --git a/drivers/rtc/rtc-dm355evm.c b/drivers/rtc/rtc-dm355evm.c index 2322c43af201bd..d4457afcba8910 100644 --- a/drivers/rtc/rtc-dm355evm.c +++ b/drivers/rtc/rtc-dm355evm.c @@ -161,16 +161,6 @@ static struct platform_driver rtc_dm355evm_driver = { }, }; -static int __init dm355evm_rtc_init(void) -{ - return platform_driver_register(&rtc_dm355evm_driver); -} -module_init(dm355evm_rtc_init); - -static void __exit dm355evm_rtc_exit(void) -{ - platform_driver_unregister(&rtc_dm355evm_driver); -} -module_exit(dm355evm_rtc_exit); +module_platform_driver(rtc_dm355evm_driver); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-ds1286.c b/drivers/rtc/rtc-ds1286.c index 68e6caf2549662..990c3ff489bf1f 100644 --- a/drivers/rtc/rtc-ds1286.c +++ b/drivers/rtc/rtc-ds1286.c @@ -396,21 +396,10 @@ static struct platform_driver ds1286_platform_driver = { .remove = __devexit_p(ds1286_remove), }; -static int __init ds1286_init(void) -{ - return platform_driver_register(&ds1286_platform_driver); -} - -static void __exit ds1286_exit(void) -{ - platform_driver_unregister(&ds1286_platform_driver); -} +module_platform_driver(ds1286_platform_driver); MODULE_AUTHOR("Thomas Bogendoerfer "); MODULE_DESCRIPTION("DS1286 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-ds1286"); - -module_init(ds1286_init); -module_exit(ds1286_exit); diff --git a/drivers/rtc/rtc-ds1511.c b/drivers/rtc/rtc-ds1511.c index 586c244a05d854..761f36bc83a95e 100644 --- a/drivers/rtc/rtc-ds1511.c +++ b/drivers/rtc/rtc-ds1511.c @@ -580,20 +580,7 @@ static struct platform_driver ds1511_rtc_driver = { }, }; - static int __init -ds1511_rtc_init(void) -{ - return platform_driver_register(&ds1511_rtc_driver); -} - - static void __exit -ds1511_rtc_exit(void) -{ - platform_driver_unregister(&ds1511_rtc_driver); -} - -module_init(ds1511_rtc_init); -module_exit(ds1511_rtc_exit); +module_platform_driver(ds1511_rtc_driver); MODULE_AUTHOR("Andrew Sharp "); MODULE_DESCRIPTION("Dallas DS1511 RTC driver"); diff --git a/drivers/rtc/rtc-ds1553.c b/drivers/rtc/rtc-ds1553.c index 1350029044e624..6f0a1b530f2e8b 100644 --- a/drivers/rtc/rtc-ds1553.c +++ b/drivers/rtc/rtc-ds1553.c @@ -361,18 +361,7 @@ static struct platform_driver ds1553_rtc_driver = { }, }; -static __init int ds1553_init(void) -{ - return platform_driver_register(&ds1553_rtc_driver); -} - -static __exit void ds1553_exit(void) -{ - platform_driver_unregister(&ds1553_rtc_driver); -} - -module_init(ds1553_init); -module_exit(ds1553_exit); +module_platform_driver(ds1553_rtc_driver); MODULE_AUTHOR("Atsushi Nemoto "); MODULE_DESCRIPTION("Dallas DS1553 RTC driver"); diff --git a/drivers/rtc/rtc-ds1742.c b/drivers/rtc/rtc-ds1742.c index e3e0f92b60f0d9..76112667c50781 100644 --- a/drivers/rtc/rtc-ds1742.c +++ b/drivers/rtc/rtc-ds1742.c @@ -240,18 +240,7 @@ static struct platform_driver ds1742_rtc_driver = { }, }; -static __init int ds1742_init(void) -{ - return platform_driver_register(&ds1742_rtc_driver); -} - -static __exit void ds1742_exit(void) -{ - platform_driver_unregister(&ds1742_rtc_driver); -} - -module_init(ds1742_init); -module_exit(ds1742_exit); +module_platform_driver(ds1742_rtc_driver); MODULE_AUTHOR("Atsushi Nemoto "); MODULE_DESCRIPTION("Dallas DS1742 RTC driver"); diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index b6473631d18213..1481e362773ef5 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -355,17 +355,7 @@ struct platform_driver jz4740_rtc_driver = { }, }; -static int __init jz4740_rtc_init(void) -{ - return platform_driver_register(&jz4740_rtc_driver); -} -module_init(jz4740_rtc_init); - -static void __exit jz4740_rtc_exit(void) -{ - platform_driver_unregister(&jz4740_rtc_driver); -} -module_exit(jz4740_rtc_exit); +module_platform_driver(jz4740_rtc_driver); MODULE_AUTHOR("Lars-Peter Clausen "); MODULE_LICENSE("GPL"); diff --git a/drivers/rtc/rtc-lpc32xx.c b/drivers/rtc/rtc-lpc32xx.c index ae16250c762f84..ecc1713b2b4f82 100644 --- a/drivers/rtc/rtc-lpc32xx.c +++ b/drivers/rtc/rtc-lpc32xx.c @@ -396,17 +396,7 @@ static struct platform_driver lpc32xx_rtc_driver = { }, }; -static int __init lpc32xx_rtc_init(void) -{ - return platform_driver_register(&lpc32xx_rtc_driver); -} -module_init(lpc32xx_rtc_init); - -static void __exit lpc32xx_rtc_exit(void) -{ - platform_driver_unregister(&lpc32xx_rtc_driver); -} -module_exit(lpc32xx_rtc_exit); +module_platform_driver(lpc32xx_rtc_driver); MODULE_AUTHOR("Kevin Wells "); MODULE_DESCRIPTION("M48T35 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-m48t35"); - -module_init(m48t35_init); -module_exit(m48t35_exit); diff --git a/drivers/rtc/rtc-m48t59.c b/drivers/rtc/rtc-m48t59.c index 28365388fb6c86..30ebfec9fd2b84 100644 --- a/drivers/rtc/rtc-m48t59.c +++ b/drivers/rtc/rtc-m48t59.c @@ -530,18 +530,7 @@ static struct platform_driver m48t59_rtc_driver = { .remove = __devexit_p(m48t59_rtc_remove), }; -static int __init m48t59_rtc_init(void) -{ - return platform_driver_register(&m48t59_rtc_driver); -} - -static void __exit m48t59_rtc_exit(void) -{ - platform_driver_unregister(&m48t59_rtc_driver); -} - -module_init(m48t59_rtc_init); -module_exit(m48t59_rtc_exit); +module_platform_driver(m48t59_rtc_driver); MODULE_AUTHOR("Mark Zhan "); MODULE_DESCRIPTION("M48T59/M48T02/M48T08 RTC driver"); diff --git a/drivers/rtc/rtc-m48t86.c b/drivers/rtc/rtc-m48t86.c index f981287d582b76..863fb3363aa6da 100644 --- a/drivers/rtc/rtc-m48t86.c +++ b/drivers/rtc/rtc-m48t86.c @@ -185,21 +185,10 @@ static struct platform_driver m48t86_rtc_platform_driver = { .remove = __devexit_p(m48t86_rtc_remove), }; -static int __init m48t86_rtc_init(void) -{ - return platform_driver_register(&m48t86_rtc_platform_driver); -} - -static void __exit m48t86_rtc_exit(void) -{ - platform_driver_unregister(&m48t86_rtc_platform_driver); -} +module_platform_driver(m48t86_rtc_platform_driver); MODULE_AUTHOR("Alessandro Zummo "); MODULE_DESCRIPTION("M48T86 RTC driver"); MODULE_LICENSE("GPL"); MODULE_VERSION(DRV_VERSION); MODULE_ALIAS("platform:rtc-m48t86"); - -module_init(m48t86_rtc_init); -module_exit(m48t86_rtc_exit); diff --git a/drivers/rtc/rtc-max8925.c b/drivers/rtc/rtc-max8925.c index 3bc046f427e04c..4a5529346b4746 100644 --- a/drivers/rtc/rtc-max8925.c +++ b/drivers/rtc/rtc-max8925.c @@ -299,17 +299,7 @@ static struct platform_driver max8925_rtc_driver = { .remove = __devexit_p(max8925_rtc_remove), }; -static int __init max8925_rtc_init(void) -{ - return platform_driver_register(&max8925_rtc_driver); -} -module_init(max8925_rtc_init); - -static void __exit max8925_rtc_exit(void) -{ - platform_driver_unregister(&max8925_rtc_driver); -} -module_exit(max8925_rtc_exit); +module_platform_driver(max8925_rtc_driver); MODULE_DESCRIPTION("Maxim MAX8925 RTC driver"); MODULE_AUTHOR("Haojian Zhuang "); diff --git a/drivers/rtc/rtc-max8998.c b/drivers/rtc/rtc-max8998.c index 2e48aa604273ad..7196f438c08957 100644 --- a/drivers/rtc/rtc-max8998.c +++ b/drivers/rtc/rtc-max8998.c @@ -327,17 +327,7 @@ static struct platform_driver max8998_rtc_driver = { .id_table = max8998_rtc_id, }; -static int __init max8998_rtc_init(void) -{ - return platform_driver_register(&max8998_rtc_driver); -} -module_init(max8998_rtc_init); - -static void __exit max8998_rtc_exit(void) -{ - platform_driver_unregister(&max8998_rtc_driver); -} -module_exit(max8998_rtc_exit); +module_platform_driver(max8998_rtc_driver); MODULE_AUTHOR("Minkyu Kang "); MODULE_AUTHOR("Joonyoung Shim "); diff --git a/drivers/rtc/rtc-mpc5121.c b/drivers/rtc/rtc-mpc5121.c index da60915818b68c..9d3caccfc250ff 100644 --- a/drivers/rtc/rtc-mpc5121.c +++ b/drivers/rtc/rtc-mpc5121.c @@ -418,17 +418,7 @@ static struct platform_driver mpc5121_rtc_driver = { .remove = __devexit_p(mpc5121_rtc_remove), }; -static int __init mpc5121_rtc_init(void) -{ - return platform_driver_register(&mpc5121_rtc_driver); -} -module_init(mpc5121_rtc_init); - -static void __exit mpc5121_rtc_exit(void) -{ - platform_driver_unregister(&mpc5121_rtc_driver); -} -module_exit(mpc5121_rtc_exit); +module_platform_driver(mpc5121_rtc_driver); MODULE_LICENSE("GPL"); MODULE_AUTHOR("John Rigby "); diff --git a/drivers/rtc/rtc-mrst.c b/drivers/rtc/rtc-mrst.c index bb21f443fb7038..6cd6c723534491 100644 --- a/drivers/rtc/rtc-mrst.c +++ b/drivers/rtc/rtc-mrst.c @@ -537,18 +537,7 @@ static struct platform_driver vrtc_mrst_platform_driver = { } }; -static int __init vrtc_mrst_init(void) -{ - return platform_driver_register(&vrtc_mrst_platform_driver); -} - -static void __exit vrtc_mrst_exit(void) -{ - platform_driver_unregister(&vrtc_mrst_platform_driver); -} - -module_init(vrtc_mrst_init); -module_exit(vrtc_mrst_exit); +module_platform_driver(vrtc_mrst_platform_driver); MODULE_AUTHOR("Jacob Pan; Feng Tang"); MODULE_DESCRIPTION("Driver for Moorestown virtual RTC"); diff --git a/drivers/rtc/rtc-pcf50633.c b/drivers/rtc/rtc-pcf50633.c index 0c423892923c99..a20202f9ee577d 100644 --- a/drivers/rtc/rtc-pcf50633.c +++ b/drivers/rtc/rtc-pcf50633.c @@ -294,17 +294,7 @@ static struct platform_driver pcf50633_rtc_driver = { .remove = __devexit_p(pcf50633_rtc_remove), }; -static int __init pcf50633_rtc_init(void) -{ - return platform_driver_register(&pcf50633_rtc_driver); -} -module_init(pcf50633_rtc_init); - -static void __exit pcf50633_rtc_exit(void) -{ - platform_driver_unregister(&pcf50633_rtc_driver); -} -module_exit(pcf50633_rtc_exit); +module_platform_driver(pcf50633_rtc_driver); MODULE_DESCRIPTION("PCF50633 RTC driver"); MODULE_AUTHOR("Balaji Rao "); diff --git a/drivers/rtc/rtc-pm8xxx.c b/drivers/rtc/rtc-pm8xxx.c index d420e9d877e85e..9f1d6bcbdf6cf0 100644 --- a/drivers/rtc/rtc-pm8xxx.c +++ b/drivers/rtc/rtc-pm8xxx.c @@ -532,17 +532,7 @@ static struct platform_driver pm8xxx_rtc_driver = { }, }; -static int __init pm8xxx_rtc_init(void) -{ - return platform_driver_register(&pm8xxx_rtc_driver); -} -module_init(pm8xxx_rtc_init); - -static void __exit pm8xxx_rtc_exit(void) -{ - platform_driver_unregister(&pm8xxx_rtc_driver); -} -module_exit(pm8xxx_rtc_exit); +module_platform_driver(pm8xxx_rtc_driver); MODULE_ALIAS("platform:rtc-pm8xxx"); MODULE_DESCRIPTION("PMIC8xxx RTC driver"); diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index 175067a17c46f3..aef40bd2957be7 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -673,21 +673,7 @@ static struct platform_driver s3c_rtc_driver = { }, }; -static char __initdata banner[] = "S3C24XX RTC, (c) 2004,2006 Simtec Electronics\n"; - -static int __init s3c_rtc_init(void) -{ - printk(banner); - return platform_driver_register(&s3c_rtc_driver); -} - -static void __exit s3c_rtc_exit(void) -{ - platform_driver_unregister(&s3c_rtc_driver); -} - -module_init(s3c_rtc_init); -module_exit(s3c_rtc_exit); +module_platform_driver(s3c_rtc_driver); MODULE_DESCRIPTION("Samsung S3C RTC Driver"); MODULE_AUTHOR("Ben Dooks "); diff --git a/drivers/rtc/rtc-sa1100.c b/drivers/rtc/rtc-sa1100.c index fc1ffe97fca191..4595d3e645a735 100644 --- a/drivers/rtc/rtc-sa1100.c +++ b/drivers/rtc/rtc-sa1100.c @@ -435,18 +435,7 @@ static struct platform_driver sa1100_rtc_driver = { }, }; -static int __init sa1100_rtc_init(void) -{ - return platform_driver_register(&sa1100_rtc_driver); -} - -static void __exit sa1100_rtc_exit(void) -{ - platform_driver_unregister(&sa1100_rtc_driver); -} - -module_init(sa1100_rtc_init); -module_exit(sa1100_rtc_exit); +module_platform_driver(sa1100_rtc_driver); MODULE_AUTHOR("Richard Purdie "); MODULE_DESCRIPTION("SA11x0/PXA2xx Realtime Clock Driver (RTC)"); diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 893bac2bb21b61..19a28a671a8e74 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -516,17 +516,7 @@ static struct platform_driver spear_rtc_driver = { }, }; -static int __init rtc_init(void) -{ - return platform_driver_register(&spear_rtc_driver); -} -module_init(rtc_init); - -static void __exit rtc_exit(void) -{ - platform_driver_unregister(&spear_rtc_driver); -} -module_exit(rtc_exit); +module_platform_driver(spear_rtc_driver); MODULE_ALIAS("platform:rtc-spear"); MODULE_AUTHOR("Rajeev Kumar "); diff --git a/drivers/rtc/rtc-stk17ta8.c b/drivers/rtc/rtc-stk17ta8.c index ed3e9b5990315c..7621116bd20d25 100644 --- a/drivers/rtc/rtc-stk17ta8.c +++ b/drivers/rtc/rtc-stk17ta8.c @@ -370,18 +370,7 @@ static struct platform_driver stk17ta8_rtc_driver = { }, }; -static __init int stk17ta8_init(void) -{ - return platform_driver_register(&stk17ta8_rtc_driver); -} - -static __exit void stk17ta8_exit(void) -{ - platform_driver_unregister(&stk17ta8_rtc_driver); -} - -module_init(stk17ta8_init); -module_exit(stk17ta8_exit); +module_platform_driver(stk17ta8_rtc_driver); MODULE_AUTHOR("Thomas Hommel "); MODULE_DESCRIPTION("Simtek STK17TA8 RTC driver"); diff --git a/drivers/rtc/rtc-stmp3xxx.c b/drivers/rtc/rtc-stmp3xxx.c index 7315068daa5979..10287865e33012 100644 --- a/drivers/rtc/rtc-stmp3xxx.c +++ b/drivers/rtc/rtc-stmp3xxx.c @@ -276,18 +276,7 @@ static struct platform_driver stmp3xxx_rtcdrv = { }, }; -static int __init stmp3xxx_rtc_init(void) -{ - return platform_driver_register(&stmp3xxx_rtcdrv); -} - -static void __exit stmp3xxx_rtc_exit(void) -{ - platform_driver_unregister(&stmp3xxx_rtcdrv); -} - -module_init(stmp3xxx_rtc_init); -module_exit(stmp3xxx_rtc_exit); +module_platform_driver(stmp3xxx_rtcdrv); MODULE_DESCRIPTION("STMP3xxx RTC Driver"); MODULE_AUTHOR("dmitry pervushin and " diff --git a/drivers/rtc/rtc-v3020.c b/drivers/rtc/rtc-v3020.c index f71c3ce1803692..bca5d677bc8538 100644 --- a/drivers/rtc/rtc-v3020.c +++ b/drivers/rtc/rtc-v3020.c @@ -393,18 +393,7 @@ static struct platform_driver rtc_device_driver = { }, }; -static __init int v3020_init(void) -{ - return platform_driver_register(&rtc_device_driver); -} - -static __exit void v3020_exit(void) -{ - platform_driver_unregister(&rtc_device_driver); -} - -module_init(v3020_init); -module_exit(v3020_exit); +module_platform_driver(rtc_device_driver); MODULE_DESCRIPTION("V3020 RTC"); MODULE_AUTHOR("Raphael Assenat"); diff --git a/drivers/rtc/rtc-vr41xx.c b/drivers/rtc/rtc-vr41xx.c index c5698cda366a91..fcbfdda2993bea 100644 --- a/drivers/rtc/rtc-vr41xx.c +++ b/drivers/rtc/rtc-vr41xx.c @@ -405,15 +405,4 @@ static struct platform_driver rtc_platform_driver = { }, }; -static int __init vr41xx_rtc_init(void) -{ - return platform_driver_register(&rtc_platform_driver); -} - -static void __exit vr41xx_rtc_exit(void) -{ - platform_driver_unregister(&rtc_platform_driver); -} - -module_init(vr41xx_rtc_init); -module_exit(vr41xx_rtc_exit); +module_platform_driver(rtc_platform_driver); diff --git a/drivers/rtc/rtc-vt8500.c b/drivers/rtc/rtc-vt8500.c index f93f412423c6d4..9e94fb147c26af 100644 --- a/drivers/rtc/rtc-vt8500.c +++ b/drivers/rtc/rtc-vt8500.c @@ -311,17 +311,7 @@ static struct platform_driver vt8500_rtc_driver = { }, }; -static int __init vt8500_rtc_init(void) -{ - return platform_driver_register(&vt8500_rtc_driver); -} -module_init(vt8500_rtc_init); - -static void __exit vt8500_rtc_exit(void) -{ - platform_driver_unregister(&vt8500_rtc_driver); -} -module_exit(vt8500_rtc_exit); +module_platform_driver(vt8500_rtc_driver); MODULE_AUTHOR("Alexey Charkov "); MODULE_DESCRIPTION("VIA VT8500 SoC Realtime Clock Driver (RTC)"); diff --git a/drivers/rtc/rtc-wm831x.c b/drivers/rtc/rtc-wm831x.c index 657c6f67b28784..3b6e6a67e765b3 100644 --- a/drivers/rtc/rtc-wm831x.c +++ b/drivers/rtc/rtc-wm831x.c @@ -468,17 +468,7 @@ static struct platform_driver wm831x_rtc_driver = { }, }; -static int __init wm831x_rtc_init(void) -{ - return platform_driver_register(&wm831x_rtc_driver); -} -module_init(wm831x_rtc_init); - -static void __exit wm831x_rtc_exit(void) -{ - platform_driver_unregister(&wm831x_rtc_driver); -} -module_exit(wm831x_rtc_exit); +module_platform_driver(wm831x_rtc_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("RTC driver for the WM831x series PMICs"); diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c index 66421426e40452..c2e52d15abb295 100644 --- a/drivers/rtc/rtc-wm8350.c +++ b/drivers/rtc/rtc-wm8350.c @@ -486,17 +486,7 @@ static struct platform_driver wm8350_rtc_driver = { }, }; -static int __init wm8350_rtc_init(void) -{ - return platform_driver_register(&wm8350_rtc_driver); -} -module_init(wm8350_rtc_init); - -static void __exit wm8350_rtc_exit(void) -{ - platform_driver_unregister(&wm8350_rtc_driver); -} -module_exit(wm8350_rtc_exit); +module_platform_driver(wm8350_rtc_driver); MODULE_AUTHOR("Mark Brown "); MODULE_DESCRIPTION("RTC driver for the WM8350"); From a46481d7af1e6c59c03f3ddac400d9054f804952 Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:52 -0800 Subject: [PATCH 108/124] drivers/rtc/rtc-mc13xxx.c: make mc13xxx_rtc_idtable static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Axel Lin Acked-by: Uwe Kleine-König Cc: Alessandro Zummo Acked-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-mc13xxx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-mc13xxx.c b/drivers/rtc/rtc-mc13xxx.c index 9d0c3b478d558e..546f6850bffbd9 100644 --- a/drivers/rtc/rtc-mc13xxx.c +++ b/drivers/rtc/rtc-mc13xxx.c @@ -399,7 +399,7 @@ static int __exit mc13xxx_rtc_remove(struct platform_device *pdev) return 0; } -const struct platform_device_id mc13xxx_rtc_idtable[] = { +static const struct platform_device_id mc13xxx_rtc_idtable[] = { { .name = "mc13783-rtc", }, { From 681d0378a9057a92b9e6e51c2112e53d920a092d Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 10 Jan 2012 15:10:55 -0800 Subject: [PATCH 109/124] drivers/rtc/rtc-jz4740.c: make jz4740_rtc_driver static Signed-off-by: Axel Lin Cc: Lars-Peter Clausen Cc: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-jz4740.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-jz4740.c b/drivers/rtc/rtc-jz4740.c index 1481e362773ef5..05ab227eeff725 100644 --- a/drivers/rtc/rtc-jz4740.c +++ b/drivers/rtc/rtc-jz4740.c @@ -345,7 +345,7 @@ static const struct dev_pm_ops jz4740_pm_ops = { #define JZ4740_RTC_PM_OPS NULL #endif /* CONFIG_PM */ -struct platform_driver jz4740_rtc_driver = { +static struct platform_driver jz4740_rtc_driver = { .probe = jz4740_rtc_probe, .remove = __devexit_p(jz4740_rtc_remove), .driver = { From 6c3fb55793f79bc975df0494c4d56ea6f0b0cc45 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Tue, 10 Jan 2012 15:10:58 -0800 Subject: [PATCH 110/124] drivers/rtc/: remove redundant spi driver bus initialization In ancient times it was necessary to manually initialize the bus field of an spi_driver to spi_bus_type. These days this is done in spi_driver_register(), so we can drop the manual assignment. The patch was generated using the following coccinelle semantic patch: // @@ identifier _driver; @@ struct spi_driver _driver = { .driver = { - .bus = &spi_bus_type, }, }; // Signed-off-by: Lars-Peter Clausen Cc: John Stultz Cc: Alessandro Zummo Cc: Grant Likely Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-m41t93.c | 1 - drivers/rtc/rtc-m41t94.c | 1 - drivers/rtc/rtc-max6902.c | 1 - drivers/rtc/rtc-pcf2123.c | 1 - drivers/rtc/rtc-rs5c348.c | 1 - 5 files changed, 5 deletions(-) diff --git a/drivers/rtc/rtc-m41t93.c b/drivers/rtc/rtc-m41t93.c index 7317d3b9a3d54f..ef71132ff205fc 100644 --- a/drivers/rtc/rtc-m41t93.c +++ b/drivers/rtc/rtc-m41t93.c @@ -200,7 +200,6 @@ static int __devexit m41t93_remove(struct spi_device *spi) static struct spi_driver m41t93_driver = { .driver = { .name = "rtc-m41t93", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = m41t93_probe, diff --git a/drivers/rtc/rtc-m41t94.c b/drivers/rtc/rtc-m41t94.c index e259ed76ae856d..2a4721f617975b 100644 --- a/drivers/rtc/rtc-m41t94.c +++ b/drivers/rtc/rtc-m41t94.c @@ -147,7 +147,6 @@ static int __devexit m41t94_remove(struct spi_device *spi) static struct spi_driver m41t94_driver = { .driver = { .name = "rtc-m41t94", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = m41t94_probe, diff --git a/drivers/rtc/rtc-max6902.c b/drivers/rtc/rtc-max6902.c index 0ec3f588a255dc..1f6b3cc58e8a36 100644 --- a/drivers/rtc/rtc-max6902.c +++ b/drivers/rtc/rtc-max6902.c @@ -154,7 +154,6 @@ static int __devexit max6902_remove(struct spi_device *spi) static struct spi_driver max6902_driver = { .driver = { .name = "rtc-max6902", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = max6902_probe, diff --git a/drivers/rtc/rtc-pcf2123.c b/drivers/rtc/rtc-pcf2123.c index 2ee3bbf7e5ea03..b46c4004d8fe4e 100644 --- a/drivers/rtc/rtc-pcf2123.c +++ b/drivers/rtc/rtc-pcf2123.c @@ -340,7 +340,6 @@ static int __devexit pcf2123_remove(struct spi_device *spi) static struct spi_driver pcf2123_driver = { .driver = { .name = "rtc-pcf2123", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = pcf2123_probe, diff --git a/drivers/rtc/rtc-rs5c348.c b/drivers/rtc/rtc-rs5c348.c index 971bc8e08da660..ce2ca8523ddd5d 100644 --- a/drivers/rtc/rtc-rs5c348.c +++ b/drivers/rtc/rtc-rs5c348.c @@ -229,7 +229,6 @@ static int __devexit rs5c348_remove(struct spi_device *spi) static struct spi_driver rs5c348_driver = { .driver = { .name = "rtc-rs5c348", - .bus = &spi_bus_type, .owner = THIS_MODULE, }, .probe = rs5c348_probe, From 948170f8944dfd29d13612fff48110a9814daeb1 Mon Sep 17 00:00:00 2001 From: Benoit Cousson Date: Tue, 10 Jan 2012 15:10:59 -0800 Subject: [PATCH 111/124] drivers/rtc/rtc-twl.c: add DT support for RTC inside twl4030/twl6030 Add the DT support for the TI rtc-twl present in the twl4030 and twl6030 devices. Signed-off-by: Benoit Cousson Acked-by: Alessandro Zummo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/devicetree/bindings/rtc/twl-rtc.txt | 12 ++++++++++++ drivers/rtc/rtc-twl.c | 10 ++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) create mode 100644 Documentation/devicetree/bindings/rtc/twl-rtc.txt diff --git a/Documentation/devicetree/bindings/rtc/twl-rtc.txt b/Documentation/devicetree/bindings/rtc/twl-rtc.txt new file mode 100644 index 00000000000000..596e0c97be7aa3 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/twl-rtc.txt @@ -0,0 +1,12 @@ +* TI twl RTC + +The TWL family (twl4030/6030) contains a RTC. + +Required properties: +- compatible : Should be twl4030-rtc + +Examples: + +rtc@0 { + compatible = "ti,twl4030-rtc"; +}; diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 20687d55e7a72d..d43b4f6eb4e420 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -550,6 +550,11 @@ static int twl_rtc_resume(struct platform_device *pdev) #define twl_rtc_resume NULL #endif +static const struct of_device_id twl_rtc_of_match[] = { + {.compatible = "ti,twl4030-rtc", }, + { }, +}; +MODULE_DEVICE_TABLE(of, twl_rtc_of_match); MODULE_ALIAS("platform:twl_rtc"); static struct platform_driver twl4030rtc_driver = { @@ -559,8 +564,9 @@ static struct platform_driver twl4030rtc_driver = { .suspend = twl_rtc_suspend, .resume = twl_rtc_resume, .driver = { - .owner = THIS_MODULE, - .name = "twl_rtc", + .owner = THIS_MODULE, + .name = "twl_rtc", + .of_match_table = twl_rtc_of_match, }, }; From e74a8f2edb92cb690b467cea0ab652c509e9f624 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 10 Jan 2012 15:11:02 -0800 Subject: [PATCH 112/124] drivers/rtc/interface.c: fix alarm rollover when day or month is out-of-range Commit f44f7f96a20a ("RTC: Initialize kernel state from RTC") introduced a potential infinite loop. If an alarm time contains a wildcard month and an invalid day (> 31), or a wildcard year and an invalid month (>= 12), the loop searching for the next matching date will never terminate. Treat the invalid values as wildcards. Fixes , Reported-by: leo weppelman Reported-by: "P. van Gaans" Signed-off-by: Ben Hutchings Signed-off-by: Jonathan Nieder Cc: Mark Brown Cc: Marcelo Roberto Jimenez Cc: Thomas Gleixner Cc: John Stultz Acked-by: Alessandro Zummo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/interface.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 8e286259a007fb..8a1c031391d66f 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -228,11 +228,11 @@ int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) alarm->time.tm_hour = now.tm_hour; /* For simplicity, only support date rollover for now */ - if (alarm->time.tm_mday == -1) { + if (alarm->time.tm_mday < 1 || alarm->time.tm_mday > 31) { alarm->time.tm_mday = now.tm_mday; missing = day; } - if (alarm->time.tm_mon == -1) { + if ((unsigned)alarm->time.tm_mon >= 12) { alarm->time.tm_mon = now.tm_mon; if (missing == none) missing = month; From b18c1c6e0c90cbcd38ba879bd63a44c94e4f7301 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 10 Jan 2012 15:11:05 -0800 Subject: [PATCH 113/124] reiserfs: delete comments referring to the BKL Signed-off-by: Davidlohr Bueso Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index eb711060a6f2b7..cce8e8710df2b7 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2896,14 +2896,13 @@ int journal_transaction_should_end(struct reiserfs_transaction_handle *th, journal->j_cnode_free < (journal->j_trans_max * 3)) { return 1; } - /* protected by the BKL here */ + journal->j_len_alloc += new_alloc; th->t_blocks_allocated += new_alloc ; return 0; } -/* this must be called inside a transaction, and requires the -** kernel_lock to be held +/* this must be called inside a transaction */ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) { @@ -2914,8 +2913,7 @@ void reiserfs_block_writes(struct reiserfs_transaction_handle *th) return; } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_allow_writes(struct super_block *s) { @@ -2924,8 +2922,7 @@ void reiserfs_allow_writes(struct super_block *s) wake_up(&journal->j_join_wait); } -/* this must be called without a transaction started, and does not -** require BKL +/* this must be called without a transaction started */ void reiserfs_wait_on_write_block(struct super_block *s) { From f32485be8397ad811312bc055d2e2a5906bc7576 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:07 -0800 Subject: [PATCH 114/124] reiserfs: delay reiserfs lock until journal initialization In the mount path, transactions that are made before journal initialization don't involve the filesystem. We can delay the reiserfs lock until we play with the journal. Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/bitmap.c | 3 --- fs/reiserfs/super.c | 43 ++++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index a945cd26522868..70de42f09f1d1c 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1364,10 +1364,7 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); - /* Avoid lock recursion in fault case */ - reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); - reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1d42e707d5fadc..620dd5d9e1b13a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1746,22 +1746,11 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) mutex_init(&REISERFS_SB(s)->lock); REISERFS_SB(s)->lock_depth = -1; - /* - * This function is called with the bkl, which also was the old - * locking used here. - * do_journal_begin() will soon check if we hold the lock (ie: was the - * bkl). This is likely because do_journal_begin() has several another - * callers because at this time, it doesn't seem to be necessary to - * protect against anything. - * Anyway, let's be conservative and lock for now. - */ - reiserfs_write_lock(s); - jdev_name = NULL; if (reiserfs_parse_options (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, &commit_max_age, qf_names, &qfmt) == 0) { - goto error; + goto error_unlocked; } if (jdev_name && jdev_name[0]) { REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL); @@ -1777,7 +1766,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (blocks) { SWARN(silent, s, "jmacd-7", "resize option for remount only"); - goto error; + goto error_unlocked; } /* try old format (undistributed bitmap, super block in 8-th 1k block of a device) */ @@ -1787,7 +1776,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) else if (read_super_block(s, REISERFS_DISK_OFFSET_IN_BYTES)) { SWARN(silent, s, "sh-2021", "can not find reiserfs on %s", reiserfs_bdevname(s)); - goto error; + goto error_unlocked; } rs = SB_DISK_SUPER_BLOCK(s); @@ -1803,7 +1792,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) "or increase size of your LVM partition"); SWARN(silent, s, "", "Or may be you forgot to " "reboot after fdisk when it told you to"); - goto error; + goto error_unlocked; } sbi->s_mount_state = SB_REISERFS_STATE(s); @@ -1811,8 +1800,9 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if ((errval = reiserfs_init_bitmap_cache(s))) { SWARN(silent, s, "jmacd-8", "unable to read bitmap"); - goto error; + goto error_unlocked; } + errval = -EINVAL; #ifdef CONFIG_REISERFS_CHECK SWARN(silent, s, "", "CONFIG_REISERFS_CHECK is set ON"); @@ -1835,6 +1825,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (reiserfs_barrier_flush(s)) { printk("reiserfs: using flush barriers\n"); } + + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + // set_device_ro(s->s_dev, 1) ; if (journal_init(s, jdev_name, old_format, commit_max_age)) { SWARN(silent, s, "sh-2022", @@ -1995,12 +1996,16 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) return (0); error: - if (jinit_done) { /* kill the commit thread, free journal ram */ + reiserfs_write_unlock(s); + +error_unlocked: + /* kill the commit thread, free journal ram */ + if (jinit_done) { + reiserfs_write_lock(s); journal_release_error(NULL, s); + reiserfs_write_unlock(s); } - reiserfs_write_unlock(s); - reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) brelse(SB_BUFFER_WITH_SB(s)); From 37c69b98d0dca54d9eb72226bbf2e211aaaf126e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:09 -0800 Subject: [PATCH 115/124] reiserfs: don't lock journal_init() journal_init() doesn't need the lock since no operation on the filesystem is involved there. journal_read() and get_list_bitmap() have yet to be reviewed carefully though before removing the lock there. Just keep the it around these two calls for safety. Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/journal.c | 53 ++++++++++++++++--------------------------- fs/reiserfs/super.c | 21 +++++++++-------- 2 files changed, 31 insertions(+), 43 deletions(-) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index cce8e8710df2b7..c3cf54fd4de327 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2678,16 +2678,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, char b[BDEVNAME_SIZE]; int ret; - /* - * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS - * dependency inversion warnings. - */ - reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); - reiserfs_write_lock(sb); return 1; } INIT_LIST_HEAD(&journal->j_bitmap_nodes); @@ -2695,10 +2689,8 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, - reiserfs_bmap_count(sb)); - reiserfs_write_lock(sb); - if (ret) + if (reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb))) goto free_and_return; allocate_bitmap_nodes(sb); @@ -2727,27 +2719,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name, goto free_and_return; } - /* - * We need to unlock here to avoid creating the following - * dependency: - * reiserfs_lock -> sysfs_mutex - * Because the reiserfs mmap path creates the following dependency: - * mm->mmap -> reiserfs_lock, hence we have - * mm->mmap -> reiserfs_lock ->sysfs_mutex - * This would ends up in a circular dependency with sysfs readdir path - * which does sysfs_mutex -> mm->mmap_sem - * This is fine because the reiserfs lock is useless in mount path, - * at least until we call journal_begin. We keep it for paranoid - * reasons. - */ - reiserfs_write_unlock(sb); if (journal_init_dev(sb, journal, j_dev_name) != 0) { - reiserfs_write_lock(sb); reiserfs_warning(sb, "sh-462", "unable to initialize jornal device"); goto free_and_return; } - reiserfs_write_lock(sb); rs = SB_DISK_SUPER_BLOCK(sb); @@ -2829,9 +2805,7 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); - reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); - reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; @@ -2848,24 +2822,37 @@ int journal_init(struct super_block *sb, const char *j_dev_name, init_journal_hash(sb); jl = journal->j_current_jl; + + /* + * get_list_bitmap() may call flush_commit_list() which + * requires the lock. Calling flush_commit_list() shouldn't happen + * this early but I like to be paranoid. + */ + reiserfs_write_lock(sb); jl->j_list_bitmap = get_list_bitmap(sb, jl); + reiserfs_write_unlock(sb); if (!jl->j_list_bitmap) { reiserfs_warning(sb, "journal-2005", "get_list_bitmap failed for journal list 0"); goto free_and_return; } - if (journal_read(sb) < 0) { + + /* + * Journal_read needs to be inspected in order to push down + * the lock further inside (or even remove it). + */ + reiserfs_write_lock(sb); + ret = journal_read(sb); + reiserfs_write_unlock(sb); + if (ret < 0) { reiserfs_warning(sb, "reiserfs-2006", "Replay Failure, unable to mount"); goto free_and_return; } reiserfs_mounted_fs_count++; - if (reiserfs_mounted_fs_count <= 1) { - reiserfs_write_unlock(sb); + if (reiserfs_mounted_fs_count <= 1) commit_wq = alloc_workqueue("reiserfs", WQ_MEM_RECLAIM, 0); - reiserfs_write_lock(sb); - } INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); journal->j_work_sb = sb; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 620dd5d9e1b13a..61b60380d2ce37 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1826,6 +1826,17 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) printk("reiserfs: using flush barriers\n"); } + // set_device_ro(s->s_dev, 1) ; + if (journal_init(s, jdev_name, old_format, commit_max_age)) { + SWARN(silent, s, "sh-2022", + "unable to initialize journal space"); + goto error_unlocked; + } else { + jinit_done = 1; /* once this is set, journal_release must be called + ** if we error out of the mount + */ + } + /* * This path assumed to be called with the BKL in the old times. * Now we have inherited the big reiserfs lock from it and many @@ -1836,16 +1847,6 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ reiserfs_write_lock(s); - // set_device_ro(s->s_dev, 1) ; - if (journal_init(s, jdev_name, old_format, commit_max_age)) { - SWARN(silent, s, "sh-2022", - "unable to initialize journal space"); - goto error; - } else { - jinit_done = 1; /* once this is set, journal_release must be called - ** if we error out of the mount - */ - } if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); From 9b467e6ebebbe75288aeb7e816ffbb5d35d6eaa3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Jan 2012 15:11:11 -0800 Subject: [PATCH 116/124] reiserfs: don't lock root inode searching Nothing requires that we lock the filesystem until the root inode is provided. Also iget5_locked() triggers a warning because we are holding the filesystem lock while allocating the inode, which result in a lockdep suspicion that we have a lock inversion against the reclaim path: [ 1986.896979] ================================= [ 1986.896990] [ INFO: inconsistent lock state ] [ 1986.896997] 3.1.1-main #8 [ 1986.897001] --------------------------------- [ 1986.897007] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. [ 1986.897016] kswapd0/16 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1986.897023] (&REISERFS_SB(s)->lock){+.+.?.}, at: [] reiserfs_write_lock+0x20/0x2a [ 1986.897044] {RECLAIM_FS-ON-W} state was registered at: [ 1986.897050] [] mark_held_locks+0xae/0xd0 [ 1986.897060] [] lockdep_trace_alloc+0x7d/0x91 [ 1986.897068] [] kmem_cache_alloc+0x1a/0x93 [ 1986.897078] [] reiserfs_alloc_inode+0x13/0x3d [ 1986.897088] [] alloc_inode+0x14/0x5f [ 1986.897097] [] iget5_locked+0x62/0x13a [ 1986.897106] [] reiserfs_fill_super+0x410/0x8b9 [ 1986.897114] [] mount_bdev+0x10b/0x159 [ 1986.897123] [] get_super_block+0x10/0x12 [ 1986.897131] [] mount_fs+0x59/0x12d [ 1986.897138] [] vfs_kern_mount+0x45/0x7a [ 1986.897147] [] do_kern_mount+0x2f/0xb0 [ 1986.897155] [] do_mount+0x5c2/0x612 [ 1986.897163] [] sys_mount+0x61/0x8f [ 1986.897170] [] sysenter_do_call+0x12/0x32 [ 1986.897181] irq event stamp: 7509691 [ 1986.897186] hardirqs last enabled at (7509691): [] kmem_cache_alloc+0x6e/0x93 [ 1986.897197] hardirqs last disabled at (7509690): [] kmem_cache_alloc+0x24/0x93 [ 1986.897209] softirqs last enabled at (7508896): [] __do_softirq+0xee/0xfd [ 1986.897222] softirqs last disabled at (7508859): [] do_softirq+0x50/0x9d [ 1986.897234] [ 1986.897235] other info that might help us debug this: [ 1986.897242] Possible unsafe locking scenario: [ 1986.897244] [ 1986.897250] CPU0 [ 1986.897254] ---- [ 1986.897257] lock(&REISERFS_SB(s)->lock); [ 1986.897265] [ 1986.897269] lock(&REISERFS_SB(s)->lock); [ 1986.897276] [ 1986.897277] *** DEADLOCK *** [ 1986.897278] [ 1986.897286] no locks held by kswapd0/16. [ 1986.897291] [ 1986.897292] stack backtrace: [ 1986.897299] Pid: 16, comm: kswapd0 Not tainted 3.1.1-main #8 [ 1986.897306] Call Trace: [ 1986.897314] [] ? printk+0xf/0x11 [ 1986.897324] [] print_usage_bug+0x20e/0x21a [ 1986.897332] [] ? print_irq_inversion_bug+0x172/0x172 [ 1986.897341] [] mark_lock+0x27f/0x483 [ 1986.897349] [] __lock_acquire+0x628/0x1472 [ 1986.897358] [] lock_acquire+0x47/0x5e [ 1986.897366] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897384] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897397] [] mutex_lock_nested+0x35/0x26f [ 1986.897409] [] ? reiserfs_write_lock+0x20/0x2a [ 1986.897421] [] reiserfs_write_lock+0x20/0x2a [ 1986.897433] [] map_block_for_writepage+0xc9/0x590 [ 1986.897448] [] ? create_empty_buffers+0x33/0x8f [ 1986.897461] [] ? get_parent_ip+0xb/0x31 [ 1986.897472] [] ? sub_preempt_count+0x81/0x8e [ 1986.897485] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897496] [] ? get_parent_ip+0xb/0x31 [ 1986.897508] [] reiserfs_writepage+0x1b9/0x3e7 [ 1986.897521] [] ? clear_page_dirty_for_io+0xcb/0xde [ 1986.897533] [] ? trace_hardirqs_on_caller+0x108/0x138 [ 1986.897546] [] ? trace_hardirqs_on+0xb/0xd [ 1986.897559] [] shrink_page_list+0x34f/0x5e2 [ 1986.897572] [] shrink_inactive_list+0x172/0x22c [ 1986.897585] [] shrink_zone+0x303/0x3b1 [ 1986.897597] [] ? _raw_spin_unlock+0x27/0x3d [ 1986.897611] [] kswapd+0x3b7/0x5f2 The deadlock shouldn't happen since we are doing that allocation in the mount path, the filesystem is not available for any reclaim. Still the warning is annoying. To solve this, acquire the lock later only where we need it, right before calling reiserfs_read_locked_inode() that wants to lock to walk the tree. Reported-by: Knut Petersen Signed-off-by: Frederic Weisbecker Cc: Al Viro Cc: Christoph Hellwig Cc: Jeff Mahoney Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/super.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 61b60380d2ce37..e12d8b97cd4dbf 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1519,9 +1519,7 @@ static int read_super_block(struct super_block *s, int offset) static int reread_meta_blocks(struct super_block *s) { ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); - reiserfs_write_unlock(s); wait_on_buffer(SB_BUFFER_WITH_SB(s)); - reiserfs_write_lock(s); if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { reiserfs_warning(s, "reiserfs-2504", "error reading the super"); return 1; @@ -1837,24 +1835,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) */ } - /* - * This path assumed to be called with the BKL in the old times. - * Now we have inherited the big reiserfs lock from it and many - * reiserfs helpers called in the mount path and elsewhere require - * this lock to be held even if it's not always necessary. Let's be - * conservative and hold it early. The window can be reduced after - * careful review of the code. - */ - reiserfs_write_lock(s); - if (reread_meta_blocks(s)) { SWARN(silent, s, "jmacd-9", "unable to reread meta blocks after journal init"); - goto error; + goto error_unlocked; } if (replay_only(s)) - goto error; + goto error_unlocked; if (bdev_read_only(s->s_bdev) && !(s->s_flags & MS_RDONLY)) { SWARN(silent, s, "clm-7000", @@ -1868,9 +1856,19 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) reiserfs_init_locked_inode, (void *)(&args)); if (!root_inode) { SWARN(silent, s, "jmacd-10", "get root inode failed"); - goto error; + goto error_unlocked; } + /* + * This path assumed to be called with the BKL in the old times. + * Now we have inherited the big reiserfs lock from it and many + * reiserfs helpers called in the mount path and elsewhere require + * this lock to be held even if it's not always necessary. Let's be + * conservative and hold it early. The window can be reduced after + * careful review of the code. + */ + reiserfs_write_lock(s); + if (root_inode->i_state & I_NEW) { reiserfs_read_locked_inode(root_inode, &args); unlock_new_inode(root_inode); From f350b1778f1b7713ef54fbc7e079e09e2fe098b9 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:14 -0800 Subject: [PATCH 117/124] sparc: make SA_NOMASK a synonym of SA_NODEFER Unlike other architectures, sparc currently has no SA_NODEFER definition but only the older SA_NOMASK. Since SA_NOMASK is the historical name for SA_NODEFER, add SA_NODEFER and copy what other architectures do by making SA_NOMASK a synonym for SA_NODEFER. Signed-off-by: Matt Fleming Acked-by: Oleg Nesterov Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/signal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sparc/include/asm/signal.h b/arch/sparc/include/asm/signal.h index e49b828a2471d5..aa42fe30d5b96f 100644 --- a/arch/sparc/include/asm/signal.h +++ b/arch/sparc/include/asm/signal.h @@ -143,10 +143,11 @@ struct sigstack { #define SA_ONSTACK _SV_SSTACK #define SA_RESTART _SV_INTR #define SA_ONESHOT _SV_RESET -#define SA_NOMASK 0x20u +#define SA_NODEFER 0x20u #define SA_NOCLDWAIT 0x100u #define SA_SIGINFO 0x200u +#define SA_NOMASK SA_NODEFER #define SIG_BLOCK 0x01 /* for blocking signals */ #define SIG_UNBLOCK 0x02 /* for unblocking signals */ From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: [PATCH 118/124] signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- include/linux/signal.h | 1 + kernel/signal.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c18c..46a01bdc27e284 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/include/linux/signal.h b/include/linux/signal.h index a822300a253b0d..7987ce74874b36 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern void block_sigmask(struct k_sigaction *ka, int signr); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep; diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5705ed32..d532f1709fbf44 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has succesfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +} + /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Jan 2012 15:11:20 -0800 Subject: [PATCH 119/124] procfs: make proc_get_link to use dentry instead of inode Prepare the ground for the next "map_files" patch which needs a name of a link file to analyse. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Vasiliy Kulikov Cc: "Kirill A. Shutemov" Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++++---------- include/linux/proc_fs.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 1aab5fe05a1b12..e31d95055c67c5 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,9 +166,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -183,9 +183,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -1456,13 +1456,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1492,7 +1492,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1531,7 +1531,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1823,9 +1823,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 6d9e575519cc1c..85c5073062390b 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; union proc_op { - int (*proc_get_link)(struct inode *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *); int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 10 Jan 2012 15:11:23 -0800 Subject: [PATCH 120/124] procfs: introduce the /proc//map_files/ directory This one behaves similarly to the /proc//fd/ one - it contains symlinks one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end", the target is the file. Opening a symlink results in a file that point exactly to the same inode as them vma's one. For example the ls -l of some arbitrary /proc//map_files/ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so This *helps* checkpointing process in three ways: 1. When dumping a task mappings we do know exact file that is mapped by particular region. We do this by opening /proc/$pid/map_files/$address symlink the way we do with file descriptors. 2. This also helps in determining which anonymous shared mappings are shared with each other by comparing the inodes of them. 3. When restoring a set of processes in case two of them has a mapping shared, we map the memory by the 1st one and then open its /proc/$pid/map_files/$address file and map it by the 2nd task. Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable re-reading and reparsing for this text file which slows down restore procedure significantly. Also as being pointed in (3) it is a way easier to use top level shared mapping in children as /proc/$pid/map_files/$address when needed. [akpm@linux-foundation.org: coding-style fixes] [gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE] Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vasiliy Kulikov Reviewed-by: "Kirill A. Shutemov" Cc: Tejun Heo Cc: Alexey Dobriyan Cc: Al Viro Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 355 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 12 ++ 2 files changed, 367 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index e31d95055c67c5..4d755fed3ecbe0 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include #include +#include #ifdef CONFIG_HARDWALL #include #endif @@ -134,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. @@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. + */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET diff --git a/include/linux/mm.h b/include/linux/mm.h index 5568553a41fd60..6eba2cc016c95c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +/* Look up the first VMA which exactly match the interval vm_start ... vm_end */ +static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, + unsigned long vm_start, unsigned long vm_end) +{ + struct vm_area_struct *vma = find_vma(mm, vm_start); + + if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) + vma = NULL; + + return vma; +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); #else From 97412950b10e64f347aec4a9b759395c2465adf6 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:27 -0800 Subject: [PATCH 121/124] procfs: parse mount options Add support for procfs mount options. Actual mount options are coming in the next patches. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 10 +++++++++ fs/proc/internal.h | 1 + fs/proc/root.c | 55 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 51a176622b8fda..27c762f34870a2 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -17,7 +18,9 @@ #include #include #include +#include #include +#include #include #include @@ -101,12 +104,19 @@ void __init proc_init_inodecache(void) init_once); } +static int proc_show_options(struct seq_file *seq, struct dentry *root) +{ + return 0; +} + static const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .destroy_inode = proc_destroy_inode, .drop_inode = generic_delete_inode, .evict_inode = proc_evict_inode, .statfs = simple_statfs, + .remount_fs = proc_remount, + .show_options = proc_show_options, }; static void __pde_users_dec(struct proc_dir_entry *pde) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7838e5cfec145d..292577531ad13e 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -117,6 +117,7 @@ void pde_put(struct proc_dir_entry *pde); int proc_fill_super(struct super_block *); struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); +int proc_remount(struct super_block *sb, int *flags, char *data); /* * These are generic /proc routines that use the internal diff --git a/fs/proc/root.c b/fs/proc/root.c index 03102d978180eb..6a8ac1d361a923 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "internal.h" @@ -36,6 +37,48 @@ static int proc_set_super(struct super_block *sb, void *data) return err; } +enum { + Opt_err, +}; + +static const match_table_t tokens = { + {Opt_err, NULL}, +}; + +static int proc_parse_options(char *options, struct pid_namespace *pid) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + + pr_debug("proc: options = %s\n", options); + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + args[0].to = args[0].from = 0; + token = match_token(p, tokens, args); + switch (token) { + default: + pr_err("proc: unrecognized mount option \"%s\" " + "or missing value\n", p); + return 0; + } + } + + return 1; +} + +int proc_remount(struct super_block *sb, int *flags, char *data) +{ + struct pid_namespace *pid = sb->s_fs_info; + return !proc_parse_options(data, pid); +} + static struct dentry *proc_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { @@ -43,11 +86,15 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, struct super_block *sb; struct pid_namespace *ns; struct proc_inode *ei; + char *options; - if (flags & MS_KERNMOUNT) + if (flags & MS_KERNMOUNT) { ns = (struct pid_namespace *)data; - else + options = NULL; + } else { ns = current->nsproxy->pid_ns; + options = data; + } sb = sget(fs_type, proc_test_super, proc_set_super, ns); if (IS_ERR(sb)) @@ -55,6 +102,10 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, if (!sb->s_root) { sb->s_flags = flags; + if (!proc_parse_options(options, ns)) { + deactivate_locked_super(sb); + return ERR_PTR(-EINVAL); + } err = proc_fill_super(sb); if (err) { deactivate_locked_super(sb); From 0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:31 -0800 Subject: [PATCH 122/124] procfs: add hidepid= and gid= mount options Add support for mount options to restrict access to /proc/PID/ directories. The default backward-compatible "relaxed" behaviour is left untouched. The first mount option is called "hidepid" and its value defines how much info about processes we want to be available for non-owners: hidepid=0 (default) means the old behavior - anybody may read all world-readable /proc/PID/* files. hidepid=1 means users may not access any /proc// directories, but their own. Sensitive files like cmdline, sched*, status are now protected against other users. As permission checking done in proc_pid_permission() and files' permissions are left untouched, programs expecting specific files' modes are not confused. hidepid=2 means hidepid=1 plus all /proc/PID/ will be invisible to other users. It doesn't mean that it hides whether a process exists (it can be learned by other means, e.g. by kill -0 $PID), but it hides process' euid and egid. It compicates intruder's task of gathering info about running processes, whether some daemon runs with elevated privileges, whether another user runs some sensitive program, whether other users run any program at all, etc. gid=XXX defines a group that will be able to gather all processes' info (as in hidepid=0 mode). This group should be used instead of putting nonroot user in sudoers file or something. However, untrusted users (like daemons, etc.) which are not supposed to monitor the tasks in the whole system should not be added to the group. hidepid=1 or higher is designed to restrict access to procfs files, which might reveal some sensitive private information like precise keystrokes timings: http://www.openwall.com/lists/oss-security/2011/11/05/3 hidepid=1/2 doesn't break monitoring userspace tools. ps, top, pgrep, and conky gracefully handle EPERM/ENOENT and behave as if the current user is the only user running processes. pstree shows the process subtree which contains "pstree" process. Note: the patch doesn't deal with setuid/setgid issues of keeping preopened descriptors of procfs files (like https://lkml.org/lkml/2011/2/7/368). We rely on that the leaked information like the scheduling counters of setuid apps doesn't threaten anybody's privacy - only the user started the setuid program may read the counters. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 39 +++++++++++++++++ fs/proc/base.c | 69 +++++++++++++++++++++++++++++- fs/proc/inode.c | 8 ++++ fs/proc/root.c | 21 +++++++-- include/linux/pid_namespace.h | 2 + 5 files changed, 135 insertions(+), 4 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0ec91f03422e5b..12fee132fbe2e3 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -41,6 +41,8 @@ Table of Contents 3.5 /proc//mountinfo - Information about mounts 3.6 /proc//comm & /proc//task//comm + 4 Configuring procfs + 4.1 Mount options ------------------------------------------------------------------------------ Preface @@ -1542,3 +1544,40 @@ a task to set its own or one of its thread siblings comm value. The comm value is limited in size compared to the cmdline value, so writing anything longer then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. + + +------------------------------------------------------------------------------ +Configuring procfs +------------------------------------------------------------------------------ + +4.1 Mount options +--------------------- + +The following mount options are supported: + + hidepid= Set /proc// access mode. + gid= Set the group authorized to learn processes information. + +hidepid=0 means classic mode - everybody may access all /proc// directories +(default). + +hidepid=1 means users may not access any /proc// directories but their +own. Sensitive files like cmdline, sched*, status are now protected against +other users. This makes it impossible to learn whether any user runs +specific program (given the program doesn't reveal itself by its behaviour). +As an additional bonus, as /proc//cmdline is unaccessible for other users, +poorly written programs passing sensitive information via program arguments are +now protected against local eavesdroppers. + +hidepid=2 means hidepid=1 plus all /proc// will be fully invisible to other +users. It doesn't mean that it hides a fact whether a process with a specific +pid value exists (it can be learned by other means, e.g. by "kill -0 $PID"), +but it hides process' uid and gid, which may be learned by stat()'ing +/proc// otherwise. It greatly complicates an intruder's task of gathering +information about running processes, whether some daemon runs with elevated +privileges, whether other user runs some sensitive program, whether other users +run any program at all, etc. + +gid= defines a group authorized to learn processes information otherwise +prohibited by hidepid=. If you use some daemon like identd which needs to learn +information about processes information, just add identd to this group. diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d755fed3ecbe0..8173dfd89cb262 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May current process learn task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1615,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1623,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir(). + */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -3119,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3322,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3329,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3350,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3686,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 27c762f34870a2..84fd3235a5902b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,14 @@ void __init proc_init_inodecache(void) static int proc_show_options(struct seq_file *seq, struct dentry *root) { + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 6a8ac1d361a923..46a15d8a29ca74 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,10 +38,12 @@ static int proc_set_super(struct super_block *sb, void *data) } enum { - Opt_err, + Opt_gid, Opt_hidepid, Opt_err, }; static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, {Opt_err, NULL}, }; @@ -49,8 +51,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; - - pr_debug("proc: options = %s\n", options); + int option; if (!options) return 1; @@ -63,6 +64,20 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; default: pr_err("proc: unrecognized mount option \"%s\" " "or missing value\n", p); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 38d10326246afb..e7cf6669ac3482 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,8 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + gid_t pid_gid; + int hide_pid; }; extern struct pid_namespace init_pid_ns; From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: [PATCH 123/124] workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take printf format string instead of opaque string and matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, add comment to alloc_workqueue(). None of the current in-kernel users pass in string with '%' as constant name and this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 47 ++++++++++++++++++++++++++------------- kernel/workqueue.c | 32 +++++++++++++++++--------- 2 files changed, 53 insertions(+), 26 deletions(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0d556deb497b99..eb8b9f15f2e03b 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct * -__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); +/** + * alloc_workqueue - allocate a workqueue + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @args: args for @fmt + * + * Allocate a workqueue with the specified parameters. For detailed + * information on WQ_* flags, please refer to Documentation/workqueue.txt. + * + * The __lock_name macro dance is to guarantee that single lock_class_key + * doesn't end up with different namesm, which isn't allowed by lockdep. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ +#define alloc_workqueue(fmt, flags, max_active, args...) \ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ else \ - __lock_name = #name; \ + __lock_name = #fmt; \ \ - __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(name, flags, max_active) \ - __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(fmt, flags, max_active, args...) \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + NULL, NULL, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue - * @name: name of the workqueue + * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -static inline struct workqueue_struct * -alloc_ordered_workqueue(const char *name, unsigned int flags) -{ - return alloc_workqueue(name, WQ_UNBOUND | flags, 1); -} +#define alloc_ordered_workqueue(fmt, flags, args...) \ + alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a81048..bec7b5b53e03db 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) { + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; From 6b550f9495947fc279d12c38feaf98500e8d0646 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Tue, 10 Jan 2012 15:11:37 -0800 Subject: [PATCH 124/124] user namespace: make signal.c respect user namespaces ipc/mqueue.c: for __SI_MESQ, convert the uid being sent to recipient's user namespace. (new, thanks Oleg) __send_signal: convert current's uid to the recipient's user namespace for any siginfo which is not SI_FROMKERNEL (patch from Oleg, thanks again :) do_notify_parent and do_notify_parent_cldstop: map task's uid to parent's user namespace ptrace_signal maps parent's uid into current's user namespace before including in signal to current. IIUC Oleg has argued that this shouldn't matter as the debugger will play with it, but it seems like not converting the value currently being set is misleading. Changelog: Sep 20: Inspired by Oleg's suggestion, define map_cred_ns() helper to simplify callers and help make clear what we are translating (which uid into which namespace). Passing the target task would make callers even easier to read, but we pass in user_ns because current_user_ns() != task_cred_xxx(current, user_ns). Sep 20: As recommended by Oleg, also put task_pid_vnr() under rcu_read_lock in ptrace_signal(). Sep 23: In send_signal(), detect when (user) signal is coming from an ancestor or unrelated user namespace. Pass that on to __send_signal, which sets si_uid to 0 or overflowuid if needed. Oct 12: Base on Oleg's fixup_uid() patch. On top of that, handle all SI_FROMKERNEL cases at callers, because we can't assume sender is current in those cases. Nov 10: (mhelsley) rename fixup_uid to more meaningful usern_fixup_signal_uid Nov 10: (akpm) make the !CONFIG_USER_NS case clearer Signed-off-by: Serge Hallyn Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" From: Serge Hallyn Subject: __send_signal: pass q->info, not info, to userns_fixup_signal_uid (v2) Eric Biederman pointed out that passing info is a bug and could lead to a NULL pointer deref to boot. A collection of signal, securebits, filecaps, cap_bounds, and a few other ltp tests passed with this kernel. Changelog: Nov 18: previous patch missed a leading '&' Signed-off-by: Serge Hallyn Cc: "Eric W. Biederman" From: Dan Carpenter Subject: ipc/mqueue: lock() => unlock() typo There was a double lock typo introduced in b085f4bd6b21 "user namespace: make signal.c respect user namespaces" Signed-off-by: Dan Carpenter Cc: Oleg Nesterov Cc: Matt Helsley Cc: "Eric W. Biederman" Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- ipc/mqueue.c | 7 ++++++- kernel/signal.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9a142a290749f2..9b7c8ab7d75cad 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -542,9 +543,13 @@ static void __do_notify(struct mqueue_inode_info *info) sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; + /* map current pid/uid into info->owner's namespaces */ + rcu_read_lock(); sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); - sig_i.si_uid = current_uid(); + sig_i.si_uid = user_ns_map_uid(info->user->user_ns, + current_cred(), current_uid()); + rcu_read_unlock(); kill_pid_info(info->notify.sigev_signo, &sig_i, info->notify_owner); diff --git a/kernel/signal.c b/kernel/signal.c index d532f1709fbf44..c73c4284160e1e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -28,6 +28,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -1019,6 +1020,34 @@ static inline int legacy_queue(struct sigpending *signals, int sig) return (sig < SIGRTMIN) && sigismember(&signals->signal, sig); } +/* + * map the uid in struct cred into user namespace *ns + */ +static inline uid_t map_cred_ns(const struct cred *cred, + struct user_namespace *ns) +{ + return user_ns_map_uid(ns, cred, cred->uid); +} + +#ifdef CONFIG_USER_NS +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + if (current_user_ns() == task_cred_xxx(t, user_ns)) + return; + + if (SI_FROMKERNEL(info)) + return; + + info->si_uid = user_ns_map_uid(task_cred_xxx(t, user_ns), + current_cred(), info->si_uid); +} +#else +static inline void userns_fixup_signal_uid(struct siginfo *info, struct task_struct *t) +{ + return; +} +#endif + static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, int group, int from_ancestor_ns) { @@ -1088,6 +1117,9 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_pid = 0; break; } + + userns_fixup_signal_uid(&q->info, t); + } else if (!is_si_special(info)) { if (sig >= SIGRTMIN && info->si_code != SI_USER) { /* @@ -1626,7 +1658,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig) */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(tsk->parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime); @@ -1709,7 +1742,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, */ rcu_read_lock(); info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns); - info.si_uid = __task_cred(tsk)->uid; + info.si_uid = map_cred_ns(__task_cred(tsk), + task_cred_xxx(parent, user_ns)); rcu_read_unlock(); info.si_utime = cputime_to_clock_t(tsk->utime); @@ -2125,8 +2159,11 @@ static int ptrace_signal(int signr, siginfo_t *info, info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; + rcu_read_lock(); info->si_pid = task_pid_vnr(current->parent); - info->si_uid = task_uid(current->parent); + info->si_uid = map_cred_ns(__task_cred(current->parent), + current_user_ns()); + rcu_read_unlock(); } /* If the (new) signal is now blocked, requeue it. */