@@ -78,6 +78,157 @@ xlog_cil_init_post_recovery(
 	log->l_cilp->xc_ctx->sequence = 1;
 }
 
+static inline int
+xlog_cil_iovec_space(
+	uint	niovecs)
+{
+	return round_up((sizeof(struct xfs_log_vec) +
+			niovecs * sizeof(struct xfs_log_iovec)),
+			sizeof(uint64_t));
+}
+
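To make the size arithmetic concrete: the helper returns the space needed for the xfs_log_vec header plus the iovec array, rounded up to an 8-byte boundary so that the data region placed after it starts 64-bit aligned. Here is a minimal userspace sketch of the same computation; the structures are illustrative stand-ins, not the kernel layouts:

    #include <stdint.h>
    #include <stdio.h>

    /* Stand-ins for the kernel structures; fields and sizes are illustrative. */
    struct log_iovec { void *i_addr; int i_len; unsigned int i_type; };
    struct log_vec   { struct log_vec *lv_next; struct log_iovec *lv_iovecp;
                       void *lv_item; char *lv_buf;
                       int lv_niovecs, lv_buf_len, lv_bytes, lv_size; };

    /* Round x up to a multiple of y, as the kernel's round_up() does here. */
    #define round_up(x, y)  ((((x) + (y) - 1) / (y)) * (y))

    static int iovec_space(unsigned int niovecs)
    {
            return round_up(sizeof(struct log_vec) +
                            niovecs * sizeof(struct log_iovec),
                            sizeof(uint64_t));
    }

    int main(void)
    {
            /* One header plus 3 iovec slots, rounded up to 8 bytes. */
            printf("3 iovecs -> %d header/iovec bytes\n", iovec_space(3));
            return 0;
    }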
+/*
+ * Allocate or pin log vector buffers for CIL insertion.
+ *
+ * The CIL currently uses disposable buffers for copying a snapshot of the
+ * modified items into the log during a push. The biggest problem with this is
+ * the requirement to allocate the disposable buffer during the commit if:
+ *	a) it does not exist; or
+ *	b) it is too small
+ *
+ * If we do this allocation within xlog_cil_insert_format_items(), it is done
+ * under the xc_ctx_lock, which means that a CIL push cannot occur during
+ * the memory allocation. This means that we have a potential deadlock situation
+ * under low memory conditions when we have lots of dirty metadata pinned in
+ * the CIL and we need a CIL commit to occur to free memory.
+ *
+ * To avoid this, we need to move the memory allocation outside the
+ * xc_ctx_lock, but because the log vector buffers are disposable, that opens
+ * up a TOCTOU race condition w.r.t. the CIL committing and removing the log
+ * vector buffers between the check and the formatting of the item into the
+ * log vector buffer within the xc_ctx_lock.
+ *
+ * Because the log vector buffer needs to be unchanged during the CIL push
+ * process, we cannot share the buffer between the transaction commit (which
+ * modifies the buffer) and the CIL push context that is writing the changes
+ * into the log. This means skipping preallocation of buffer space is
+ * unreliable, but we most definitely do not want to be allocating and freeing
+ * buffers unnecessarily during commits when overwrites can be done safely.
+ *
+ * The simplest solution to this problem is to allocate a shadow buffer when a
+ * log item is committed for the second time, and then to only use this buffer
+ * if necessary. The buffer can remain attached to the log item until such time
+ * as it is needed, and this is the buffer that is reallocated to match the
+ * size of the incoming modification. Then during the formatting of the item we
+ * can swap the active buffer with the new one if we can't reuse the existing
+ * buffer. We don't free the old buffer as it may be reused on the next
+ * modification if its size is right, otherwise we'll free and reallocate it at
+ * that point.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and attaches the vector to the log item in preparation
+ * for the formatting step which occurs under the xc_ctx_lock.
+ *
+ * While this means the memory footprint goes up, it avoids the repeated
+ * alloc/free pattern that repeated modifications of an item would otherwise
+ * cause, and hence minimises the CPU overhead of such behaviour.
+ */
+static void
+xlog_cil_alloc_shadow_bufs(
+	struct xlog		*log,
+	struct xfs_trans	*tp)
+{
+	struct xfs_log_item_desc *lidp;
+
+	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
+		struct xfs_log_item *lip = lidp->lid_item;
+		struct xfs_log_vec *lv;
+		int	niovecs = 0;
+		int	nbytes = 0;
+		int	buf_size;
+		bool	ordered = false;
+
+		/* Skip items which aren't dirty in this transaction. */
+		if (!(lidp->lid_flags & XFS_LID_DIRTY))
+			continue;
+
+		/* get number of vecs and size of data to be stored */
+		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
+
+		/*
+		 * Ordered items need to be tracked but we do not wish to write
+		 * them. We need a logvec to track the object, but we do not
+		 * need an iovec or buffer to be allocated for copying data.
+		 */
+		if (niovecs == XFS_LOG_VEC_ORDERED) {
+			ordered = true;
+			niovecs = 0;
+			nbytes = 0;
+		}
+
+		/*
+		 * We 64-bit align the length of each iovec so that the start
+		 * of the next one is naturally aligned. We'll need to
+		 * account for that slack space here. Then round nbytes up
+		 * to 64-bit alignment so that the initial buffer alignment is
+		 * easy to calculate and verify.
+		 */
+		nbytes += niovecs * sizeof(uint64_t);
+		nbytes = round_up(nbytes, sizeof(uint64_t));
+
+		/*
+		 * The data buffer needs to start 64-bit aligned, so round up
+		 * that space to ensure we can align it appropriately and not
+		 * overrun the buffer.
+		 */
+		buf_size = nbytes + xlog_cil_iovec_space(niovecs);
+
+		/*
+		 * if we have no shadow buffer, or it is too small, we need to
+		 * reallocate it.
+		 */
+		if (!lip->li_lv_shadow ||
+		    buf_size > lip->li_lv_shadow->lv_size) {
+
+			/*
+			 * We free and allocate here as a realloc would copy
+			 * unnecessary data. We don't use kmem_zalloc() for the
+			 * same reason - we don't need to zero the data area in
+			 * the buffer, only the log vector header and the iovec
+			 * storage.
+			 */
+			kmem_free(lip->li_lv_shadow);
+
+			lv = kmem_alloc(buf_size, KM_SLEEP|KM_NOFS);
+			memset(lv, 0, xlog_cil_iovec_space(niovecs));
+
+			lv->lv_item = lip;
+			lv->lv_size = buf_size;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
+			lip->li_lv_shadow = lv;
+		} else {
+			/* same or smaller, optimise common overwrite case */
+			lv = lip->li_lv_shadow;
+			if (ordered)
+				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
+			else
+				lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_next = NULL;
+		}
+
+		/* Ensure the lv is set up according to ->iop_size */
+		lv->lv_niovecs = niovecs;
+
+		/* The allocated data region lies beyond the iovec region */
+		lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs);
+	}
+
+}
+
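The two-phase protocol this function sets up - size and (re)allocate a spare buffer per item before taking the lock, then only reuse or swap buffers under the lock - is a general double-buffering pattern. A minimal userspace sketch of the idea, with hypothetical names and a pthread mutex standing in for xc_ctx_lock (error handling elided):

    #include <pthread.h>
    #include <stdlib.h>
    #include <string.h>

    struct buf { size_t size; char data[]; };

    struct item {
            struct buf *active;     /* owned by the committed list (under lock) */
            struct buf *shadow;     /* spare, resized outside the lock */
    };

    /* Phase 1: no lock held, so it is safe to sleep in malloc(). */
    static void prepare(struct item *it, size_t need)
    {
            if (!it->shadow || it->shadow->size < need) {
                    free(it->shadow);
                    it->shadow = malloc(sizeof(struct buf) + need);
                    it->shadow->size = need;
            }
    }

    /* Phase 2: under the lock; never allocates, only reuses or swaps. */
    static void format(struct item *it, size_t need, pthread_mutex_t *lock)
    {
            struct buf *dst;

            pthread_mutex_lock(lock);
            if (it->active && it->active->size >= need) {
                    dst = it->active;               /* common overwrite case */
            } else {
                    dst = it->shadow;               /* swap in the spare... */
                    it->shadow = it->active;        /* ...keep old as next spare */
                    it->active = dst;
            }
            memset(dst->data, 0, need);             /* "format" the changes */
            pthread_mutex_unlock(lock);
    }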
 /*
  * Prepare the log item for insertion into the CIL. Calculate the difference in
  * log space and vectors it will consume, and if it is a new item pin it as
@@ -100,16 +251,19 @@ xfs_cil_prepare_item(
 	/*
 	 * If there is no old LV, this is the first time we've seen the item in
 	 * this CIL context and so we need to pin it. If we are replacing the
-	 * old_lv, then remove the space it accounts for and free it.
+	 * old_lv, then remove the space it accounts for and make it the shadow
+	 * buffer for later freeing. In both cases we are now switching to the
+	 * shadow buffer, so update the pointer to it appropriately.
 	 */
-	if (!old_lv)
+	if (!old_lv) {
 		lv->lv_item->li_ops->iop_pin(lv->lv_item);
-	else if (old_lv != lv) {
+		lv->lv_item->li_lv_shadow = NULL;
+	} else if (old_lv != lv) {
 		ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED);
 
 		*diff_len -= old_lv->lv_bytes;
 		*diff_iovecs -= old_lv->lv_niovecs;
-		kmem_free(old_lv);
+		lv->lv_item->li_lv_shadow = old_lv;
 	}
 
 	/* attach new log vector to log item */
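The *diff_len/*diff_iovecs arithmetic is easiest to see with numbers. Below is a hypothetical relog with illustrative values only, mirroring the subtraction above plus the caller's later addition for the new vector:

    #include <stdio.h>

    /* Hypothetical relog: the old vector held 2 iovecs / 128 bytes, the
     * new one holds 3 iovecs / 256 bytes. */
    int main(void)
    {
            int diff_len = 0, diff_iovecs = 0;

            diff_len -= 128;  diff_iovecs -= 2;     /* retire old_lv */
            diff_len += 256;  diff_iovecs += 3;     /* account new lv */

            /* prints "+128 bytes, +1 iovecs" - the net CIL growth, rather
             * than double-counting the item on every relog */
            printf("%+d bytes, %+d iovecs\n", diff_len, diff_iovecs);
            return 0;
    }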
@@ -133,11 +287,13 @@ xfs_cil_prepare_item(
  * write it out asynchronously without needing to relock the object that was
  * modified at the time it gets written into the iclog.
  *
- * This function builds a vector for the changes in each log item in the
- * transaction. It then works out the length of the buffer needed for each log
- * item, allocates them and formats the vector for the item into the buffer.
- * The buffer is then attached to the log item are then inserted into the
- * Committed Item List for tracking until the next checkpoint is written out.
+ * This function takes the prepared log vectors attached to each log item, and
+ * formats the changes into the log vector buffer. The buffer it uses is
+ * dependent on the current state of the vector in the CIL - the shadow lv is
+ * guaranteed to be large enough for the current modification, but we will only
+ * use that if we can't reuse the existing lv. If we can't reuse the existing
+ * lv, then simply swap it out for the shadow lv. We don't free it - that is
+ * done lazily either by the next modification or the freeing of the log item.
 *
 * We don't set up region headers during this process; we simply copy the
 * regions into the flat buffer. We can do this because we still have to do a
@@ -170,59 +326,29 @@ xlog_cil_insert_format_items(
 	list_for_each_entry(lidp, &tp->t_items, lid_trans) {
 		struct xfs_log_item *lip = lidp->lid_item;
 		struct xfs_log_vec *lv;
-		struct xfs_log_vec *old_lv;
-		int	niovecs = 0;
-		int	nbytes = 0;
-		int	buf_size;
+		struct xfs_log_vec *old_lv = NULL;
+		struct xfs_log_vec *shadow;
 		bool	ordered = false;
 
 		/* Skip items which aren't dirty in this transaction. */
 		if (!(lidp->lid_flags & XFS_LID_DIRTY))
 			continue;
 
-		/* get number of vecs and size of data to be stored */
-		lip->li_ops->iop_size(lip, &niovecs, &nbytes);
-
-		/* Skip items that do not have any vectors for writing */
-		if (!niovecs)
-			continue;
-
 		/*
-		 * Ordered items need to be tracked but we do not wish to write
-		 * them. We need a logvec to track the object, but we do not
-		 * need an iovec or buffer to be allocated for copying data.
+		 * The formatting size information is already attached to
+		 * the shadow lv on the log item.
 		 */
-		if (niovecs == XFS_LOG_VEC_ORDERED) {
+		shadow = lip->li_lv_shadow;
+		if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED)
 			ordered = true;
-			niovecs = 0;
-			nbytes = 0;
-		}
 
-		/*
-		 * We 64-bit align the length of each iovec so that the start
-		 * of the next one is naturally aligned. We'll need to
-		 * account for that slack space here. Then round nbytes up
-		 * to 64-bit alignment so that the initial buffer alignment is
-		 * easy to calculate and verify.
-		 */
-		nbytes += niovecs * sizeof(uint64_t);
-		nbytes = round_up(nbytes, sizeof(uint64_t));
-
-		/* grab the old item if it exists for reservation accounting */
-		old_lv = lip->li_lv;
-
-		/*
-		 * The data buffer needs to start 64-bit aligned, so round up
-		 * that space to ensure we can align it appropriately and not
-		 * overrun the buffer.
-		 */
-		buf_size = nbytes +
-			round_up((sizeof(struct xfs_log_vec) +
-				  niovecs * sizeof(struct xfs_log_iovec)),
-				 sizeof(uint64_t));
+		/* Skip items that do not have any vectors for writing */
+		if (!shadow->lv_niovecs && !ordered)
+			continue;
 
 		/* compare to existing item size */
-		if (lip->li_lv && buf_size <= lip->li_lv->lv_size) {
+		old_lv = lip->li_lv;
+		if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) {
 			/* same or smaller, optimise common overwrite case */
 			lv = lip->li_lv;
 			lv->lv_next = NULL;
@@ -236,32 +362,29 @@ xlog_cil_insert_format_items(
 			 */
 			*diff_iovecs -= lv->lv_niovecs;
 			*diff_len -= lv->lv_bytes;
+
+			/* Ensure the lv is set up according to ->iop_size */
+			lv->lv_niovecs = shadow->lv_niovecs;
+
+			/* reset the lv buffer information for new formatting */
+			lv->lv_buf_len = 0;
+			lv->lv_bytes = 0;
+			lv->lv_buf = (char *)lv +
+					xlog_cil_iovec_space(lv->lv_niovecs);
 		} else {
-			/* allocate new data chunk */
-			lv = kmem_zalloc(buf_size, KM_SLEEP|KM_NOFS);
+			/* switch to shadow buffer! */
+			lv = shadow;
 			lv->lv_item = lip;
-			lv->lv_size = buf_size;
 			if (ordered) {
 				/* track as an ordered logvec */
 				ASSERT(lip->li_lv == NULL);
-				lv->lv_buf_len = XFS_LOG_VEC_ORDERED;
 				goto insert;
 			}
-			lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1];
 		}
 
-		/* Ensure the lv is set up according to ->iop_size */
-		lv->lv_niovecs = niovecs;
-
-		/* The allocated data region lies beyond the iovec region */
-		lv->lv_buf_len = 0;
-		lv->lv_bytes = 0;
-		lv->lv_buf = (char *)lv + buf_size - nbytes;
 		ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t)));
-
 		lip->li_ops->iop_format(lip, lv);
 insert:
-		ASSERT(lv->lv_buf_len <= nbytes);
 		xfs_cil_prepare_item(log, lv, old_lv, diff_len, diff_iovecs);
 	}
 }
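Taken together, the rewritten format step resolves each dirty item without any allocation: it overwrites the existing lv when the shadow size says it still fits, and otherwise swaps the shadow in. A compiling stand-alone distillation of just that decision, using a stand-in struct (illustrative only, not the kernel type):

    #include <stdio.h>

    struct log_vec { int lv_size; };    /* stand-in for struct xfs_log_vec */

    /* No allocation on either path - that is the point of the patch. */
    static struct log_vec *
    choose_target(struct log_vec *active, struct log_vec *shadow)
    {
            if (active && shadow->lv_size <= active->lv_size)
                    return active;  /* same or smaller: overwrite in place */
            return shadow;          /* otherwise: swap in the shadow buffer */
    }

    int main(void)
    {
            struct log_vec active = { .lv_size = 256 }, shadow = { .lv_size = 128 };

            /* prints "reuse": the existing 256-byte lv fits a 128-byte need */
            printf("%s\n", choose_target(&active, &shadow) == &active
                            ? "reuse" : "swap");
            return 0;
    }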
@@ -783,6 +906,13 @@ xfs_log_commit_cil(
 	struct xlog		*log = mp->m_log;
 	struct xfs_cil		*cil = log->l_cilp;
 
+	/*
+	 * Do all necessary memory allocation before we lock the CIL.
+	 * This ensures the allocation does not deadlock with a CIL
+	 * push in memory reclaim (e.g. from kswapd).
+	 */
+	xlog_cil_alloc_shadow_bufs(log, tp);
+
 	/* lock out background commit */
 	down_read(&cil->xc_ctx_lock);