7
7
#include "delta.h"
8
8
#include "count-delta.h"
9
9
10
- static int very_different (struct diff_filespec * src ,
11
- struct diff_filespec * dst ,
12
- int min_score )
10
+ static int should_break (struct diff_filespec * src ,
11
+ struct diff_filespec * dst ,
12
+ int break_score ,
13
+ int * merge_score_p )
13
14
{
14
15
/* dst is recorded as a modification of src. Are they so
15
16
* different that we are better off recording this as a pair
16
- * of delete and create? min_score is the minimum amount of
17
- * new material that must exist in the dst and not in src for
18
- * the pair to be considered a complete rewrite, and recommended
19
- * to be set to a very high value, 99% or so.
17
+ * of delete and create?
20
18
*
21
- * The value we return represents the amount of new material
22
- * that is in dst and not in src. We return 0 when we do not
23
- * want to get the filepair broken.
19
+ * There are two criteria used in this algorithm. For the
20
+ * purposes of helping later rename/copy, we take both delete
21
+ * and insert into account and estimate the amount of "edit".
22
+ * If the edit is very large, we break this pair so that
23
+ * rename/copy can pick the pieces up to match with other
24
+ * files.
25
+ *
26
+ * On the other hand, we would want to ignore inserts for the
27
+ * pure "complete rewrite" detection. As long as most of the
28
+ * existing contents were removed from the file, it is a
29
+ * complete rewrite, and if sizable chunk from the original
30
+ * still remains in the result, it is not a rewrite. It does
31
+ * not matter how much or how little new material is added to
32
+ * the file.
33
+ *
34
+ * The score we leave for such a broken filepair uses the
35
+ * latter definition so that later clean-up stage can find the
36
+ * pieces that should not have been broken according to the
37
+ * latter definition after rename/copy runs, and merge the
38
+ * broken pairs that have a score lower than the given criteria
39
+ * back together. The break operation itself happens
40
+ * according to the former definition.
41
+ *
42
+ * The break_score parameter tells us when to break (the
43
+ * amount of "edit" required for us to consider breaking the
44
+ * pair). We leave the amount of deletion in *merge_score_p
45
+ * when we return.
46
+ *
47
+ * The value we return is 1 if we want the pair to be broken,
48
+ * or 0 if we do not.
24
49
*/
25
50
void * delta ;
26
51
unsigned long delta_size , base_size , src_copied , literal_added ;
52
+ int to_break = 0 ;
53
+
54
+ * merge_score_p = 0 ; /* assume no deletion --- "do not break"
55
+ * is the default.
56
+ */
27
57
28
58
if (!S_ISREG (src -> mode ) || !S_ISREG (dst -> mode ))
29
59
return 0 ; /* leave symlink rename alone */
30
60
31
- if (diff_populate_filespec (src , 1 ) || diff_populate_filespec (dst , 1 ))
61
+ if (diff_populate_filespec (src , 0 ) || diff_populate_filespec (dst , 0 ))
32
62
return 0 ; /* error but caught downstream */
33
63
34
64
delta_size = ((src -> size < dst -> size ) ?
@@ -40,53 +70,95 @@ static int very_different(struct diff_filespec *src,
40
70
*/
41
71
base_size = ((src -> size < dst -> size ) ? dst -> size : src -> size );
42
72
43
- /*
44
- * If file size difference is too big compared to the
45
- * base_size, we declare this a complete rewrite.
46
- */
47
- if (base_size * min_score < delta_size * MAX_SCORE )
48
- return MAX_SCORE ;
49
-
50
- if (diff_populate_filespec (src , 0 ) || diff_populate_filespec (dst , 0 ))
51
- return 0 ; /* error but caught downstream */
52
-
53
73
delta = diff_delta (src -> data , src -> size ,
54
74
dst -> data , dst -> size ,
55
75
& delta_size );
56
76
57
- /* A delta that has a lot of literal additions would have
58
- * big delta_size no matter what else it does.
59
- */
60
- if (base_size * min_score < delta_size * MAX_SCORE )
61
- return MAX_SCORE ;
62
-
63
77
/* Estimate the edit size by interpreting delta. */
64
- if (count_delta (delta , delta_size , & src_copied , & literal_added )) {
78
+ if (count_delta (delta , delta_size ,
79
+ & src_copied , & literal_added )) {
65
80
free (delta );
66
- return 0 ;
81
+ return 0 ; /* we cannot tell */
67
82
}
68
83
free (delta );
69
84
70
- /* Extent of damage */
71
- if (src -> size + literal_added < src_copied )
72
- delta_size = 0 ;
85
+ /* Compute merge-score, which is "how much is removed
86
+ * from the source material". The clean-up stage will
87
+ * merge the surviving pair together if the score is
88
+ * less than the minimum, after rename/copy runs.
89
+ */
90
+ if (src -> size <= src_copied )
91
+ delta_size = 0 ; /* avoid wrapping around */
92
+ else
93
+ delta_size = src -> size - src_copied ;
94
+ * merge_score_p = delta_size * MAX_SCORE / src -> size ;
95
+
96
+ /* Extent of damage, which counts both inserts and
97
+ * deletes.
98
+ */
99
+ if (src -> size + literal_added <= src_copied )
100
+ delta_size = 0 ; /* avoid wrapping around */
73
101
else
74
102
delta_size = (src -> size - src_copied ) + literal_added ;
103
+
104
+ /* We break if the edit exceeds the minimum.
105
+ * i.e. (break_score / MAX_SCORE < delta_size / base_size)
106
+ */
107
+ if (break_score * base_size < delta_size * MAX_SCORE )
108
+ to_break = 1 ;
75
109
76
- if (base_size < delta_size )
77
- return MAX_SCORE ;
78
-
79
- return delta_size * MAX_SCORE / base_size ;
110
+ return to_break ;
80
111
}
81
112
82
- void diffcore_break (int min_score )
113
+ void diffcore_break (int break_score )
83
114
{
84
115
struct diff_queue_struct * q = & diff_queued_diff ;
85
116
struct diff_queue_struct outq ;
117
+
118
+ /* When the filepair has this much edit (insert and delete),
119
+ * it is first considered to be a rewrite and broken into a
120
+ * create and delete filepair. This is to help breaking a
121
+ * file that had too much new stuff added, possibly from
122
+ * moving contents from another file, so that rename/copy can
123
+ * match it with the other file.
124
+ *
125
+ * int break_score; we reuse incoming parameter for this.
126
+ */
127
+
128
+ /* After a pair is broken according to break_score and
129
+ * subjected to rename/copy, both of them may survive intact,
130
+ * due to lack of suitable rename/copy peer. Or, the caller
131
+ * may be calling us without using rename/copy. When that
132
+ * happens, we merge the broken pieces back into one
133
+ * modification together if the pair did not have more than
134
+ * this much delete. For this computation, we do not take
135
+ * insert into account at all. If you start from a 100-line
136
+ * file and delete 97 lines of it, it does not matter if you
137
+ * add 27 lines to it to make a new 30-line file or if you add
138
+ * 997 lines to it to make a 1000-line file. Either way what
139
+ * you did was a rewrite of 97%. On the other hand, if you
140
+ * delete 3 lines, keeping 97 lines intact, it does not matter
141
+ * if you add 3 lines to it to make a new 100-line file or if
142
+ * you add 903 lines to it to make a new 1000-line file.
143
+ * Either way you did a lot of additions and not a rewrite.
144
+ * This merge happens to catch the latter case. A merge_score
145
+ * of 80% would be a good default value (a broken pair that
146
+ * has score lower than merge_score will be merged back
147
+ * together).
148
+ */
149
+ int merge_score ;
86
150
int i ;
87
151
88
- if (!min_score )
89
- min_score = DEFAULT_BREAK_SCORE ;
152
+ /* See comment on DEFAULT_BREAK_SCORE and
153
+ * DEFAULT_MERGE_SCORE in diffcore.h
154
+ */
155
+ merge_score = (break_score >> 16 ) & 0xFFFF ;
156
+ break_score = (break_score & 0xFFFF );
157
+
158
+ if (!break_score )
159
+ break_score = DEFAULT_BREAK_SCORE ;
160
+ if (!merge_score )
161
+ merge_score = DEFAULT_MERGE_SCORE ;
90
162
91
163
outq .nr = outq .alloc = 0 ;
92
164
outq .queue = NULL ;
@@ -101,12 +173,22 @@ void diffcore_break(int min_score)
101
173
if (DIFF_FILE_VALID (p -> one ) && DIFF_FILE_VALID (p -> two ) &&
102
174
!S_ISDIR (p -> one -> mode ) && !S_ISDIR (p -> two -> mode ) &&
103
175
!strcmp (p -> one -> path , p -> two -> path )) {
104
- score = very_different ( p -> one , p -> two , min_score );
105
- if ( min_score <= score ) {
176
+ if ( should_break ( p -> one , p -> two ,
177
+ break_score , & score ) ) {
106
178
/* Split this into delete and create */
107
179
struct diff_filespec * null_one , * null_two ;
108
180
struct diff_filepair * dp ;
109
181
182
+ /* Set score to 0 for the pair that
183
+ * needs to be merged back together
184
+ * should they survive rename/copy.
185
+ * Also we do not want to break very
186
+ * small files.
187
+ */
188
+ if ((score < merge_score ) ||
189
+ (p -> one -> size < MINIMUM_BREAK_SIZE ))
190
+ score = 0 ;
191
+
110
192
/* deletion of one */
111
193
null_one = alloc_filespec (p -> one -> path );
112
194
dp = diff_queue (& outq , p -> one , null_one );
@@ -132,3 +214,77 @@ void diffcore_break(int min_score)
132
214
133
215
return ;
134
216
}
217
+
218
+ static void merge_broken (struct diff_filepair * p ,
219
+ struct diff_filepair * pp ,
220
+ struct diff_queue_struct * outq )
221
+ {
222
+ /* p and pp are broken pairs we want to merge */
223
+ struct diff_filepair * c = p , * d = pp ;
224
+ if (DIFF_FILE_VALID (p -> one )) {
225
+ /* this must be a delete half */
226
+ d = p ; c = pp ;
227
+ }
228
+ /* Sanity check */
229
+ if (!DIFF_FILE_VALID (d -> one ))
230
+ die ("internal error in merge #1" );
231
+ if (DIFF_FILE_VALID (d -> two ))
232
+ die ("internal error in merge #2" );
233
+ if (DIFF_FILE_VALID (c -> one ))
234
+ die ("internal error in merge #3" );
235
+ if (!DIFF_FILE_VALID (c -> two ))
236
+ die ("internal error in merge #4" );
237
+
238
+ diff_queue (outq , d -> one , c -> two );
239
+ diff_free_filespec_data (d -> two );
240
+ diff_free_filespec_data (c -> one );
241
+ free (d );
242
+ free (c );
243
+ }
244
+
245
+ void diffcore_merge_broken (void )
246
+ {
247
+ struct diff_queue_struct * q = & diff_queued_diff ;
248
+ struct diff_queue_struct outq ;
249
+ int i , j ;
250
+
251
+ outq .nr = outq .alloc = 0 ;
252
+ outq .queue = NULL ;
253
+
254
+ for (i = 0 ; i < q -> nr ; i ++ ) {
255
+ struct diff_filepair * p = q -> queue [i ];
256
+ if (!p )
257
+ /* we already merged this with its peer */
258
+ continue ;
259
+ else if (p -> broken_pair &&
260
+ p -> score == 0 &&
261
+ !strcmp (p -> one -> path , p -> two -> path )) {
262
+ /* If the peer also survived rename/copy, then
263
+ * we merge them back together.
264
+ */
265
+ for (j = i + 1 ; j < q -> nr ; j ++ ) {
266
+ struct diff_filepair * pp = q -> queue [j ];
267
+ if (pp -> broken_pair &&
268
+ p -> score == 0 &&
269
+ !strcmp (pp -> one -> path , pp -> two -> path ) &&
270
+ !strcmp (p -> one -> path , pp -> two -> path )) {
271
+ /* Peer survived. Merge them */
272
+ merge_broken (p , pp , & outq );
273
+ q -> queue [j ] = NULL ;
274
+ break ;
275
+ }
276
+ }
277
+ if (q -> nr <= j )
278
+ /* The peer did not survive, so we keep
279
+ * it in the output.
280
+ */
281
+ diff_q (& outq , p );
282
+ }
283
+ else
284
+ diff_q (& outq , p );
285
+ }
286
+ free (q -> queue );
287
+ * q = outq ;
288
+
289
+ return ;
290
+ }
0 commit comments