-
Notifications
You must be signed in to change notification settings - Fork 22
/
Copy pathmerge.go
331 lines (286 loc) · 11 KB
/
merge.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
// Merging
//
// There are several functions available that handle different kinds of merging:
//
// - MergeNodes(left, right Node) Node: returns a new node that merges children
// from both nodes.
//
// - MergeNodeSlices(left, right Nodes, mergeFn MergeFunction) Nodes: merges
// two slices based on the mergeFn. This allows more advanced merging when
// dealing with slices of nodes.
//
// - MergeDocuments(left, right *Document, mergeFn MergeFunction) *Document:
// creates a new document with their respective nodes merged. You can use
// IndividualBySurroundingSimilarityMergeFunction with this to merge
// individuals, rather than just appending them all.
//
// The MergeFunction is a type that can be received in some of the merging
// functions. The closure determines if two nodes should be merged and what the
// result would be. Alternatively it can also describe when two nodes should not
// be merged.
//
// You may certainly create your own MergeFunction, but there are some that are
// already included:
//
// - IndividualBySurroundingSimilarityMergeFunction creates a MergeFunction that
// will merge individuals if their surrounding similarity is at least
// minimumSimilarity.
//
// - EqualityMergeFunction is a MergeFunction that will return a merged node if
// the node are considered equal (with Equals).
package gedcom
import (
"errors"
"fmt"
)
// MergeFunction will do one of two things:
//
// 1. If the nodes should be merged, it must return a new new node.
//
// 2. If the nodes should not or could not be merged, then nil is returned.
//
// A MergeFunction can be used with MergeNodeSlices.
//
// The document parameter is used as the destination for merged nodes that need
// to be attached to a document (such as individuals).
type MergeFunction func(left, right Node, document *Document) Node
// MergeNodes returns a new node that merges children from both nodes.
//
// If either of the nodes are nil, or they are not the same tag an error will be
// returned and the result node will be nil.
//
// The node returned and all of the merged children will be created as new
// nodes as to not interfere with the original input.
//
// The document must not be nil and will be used to attach the new nodes to
// (since some nodes require a document, such as individuals). You may supply
// the same document.
func MergeNodes(left, right Node, document *Document) (Node, error) {
if IsNil(left) {
return nil, errors.New("left is nil")
}
if IsNil(right) {
return nil, errors.New("right is nil")
}
leftTag := left.Tag()
rightTag := right.Tag()
// We can only proceed if the nodes can be merged.
if !leftTag.Is(rightTag) {
return nil, fmt.Errorf("cannot merge %s and %s nodes",
leftTag.Tag(), rightTag.Tag())
}
r := DeepCopy(left, document)
for _, child := range right.Nodes() {
for _, n := range r.Nodes() {
if n.Equals(child) {
newNodes := MergeNodeSlices(child.Nodes(), n.Nodes(), document,
EqualityMergeFunction)
n.SetNodes(newNodes)
goto next
}
}
r.AddNode(child)
next:
}
return r, nil
}
// MergeNodeSlices merges two slices based on the mergeFn.
//
// The MergeFunction must not be nil, but may return nil. See MergeFunction for
// usage.
//
// The left and right may contain zero elements or be nil, these mean the same
// thing.
//
// MergeNodeSlices makes some guarantees about the result:
//
// 1. The result slice will contain at least the length of the greatest length
// of left and right. So if len(left) = 3 and len(right) = 5 then the result
// slice will have a minimum of 5 items. If both slices are empty or nil the
// minimum length will also be zero.
//
// 2. The result slice will not contain more elements than the sum of the
// lengths of the left and right. So if len(left) = 3 and len(right) = 5 then
// the largest possible slice returned is 8.
//
// 3. All of the nodes returned will be deep copies of the original nodes so it
// is safe to manipulate the result in any way without affecting the original
// input slices.
//
// 4. Any element from the left or right may only be merged once. That is to say
// that if a merge happens between a left and right node that the result node
// cannot be merged again.
//
// 5. Merges can only happen between a node on the left with a node on the
// right. Even if two nodes in the left could be merged they will not be. The
// same goes for all of the elements in the right slice.
//
// The document must not be nil and will be used to attach the new nodes to
// (since some nodes require a document, such as individuals). You may supply
// the same document.
func MergeNodeSlices(left, right Nodes, document *Document, mergeFn MergeFunction) Nodes {
newSlice := Nodes{}
// Be careful to duplicate the right slice. I'm not sure why this is
// necessary but the unit tests that reuse mergeDocumentsTests will fail if
// we do not have this.
newRight := make(Nodes, len(right))
copy(newRight, right)
right = newRight
// We start by adding all of the items on the left.
for _, node := range left {
newSlice = append(newSlice, DeepCopy(node, document))
}
// Each of the items on the right must be compared with all of the items in
// the slice, which starts off with all the items from the left but will
// grow.
//
// If the right item does not match anything previously seen then it is
// appended to the end. Otherwise the left node is removed and the new
// merged node is place on the end of the new slice.
//
// We can guarantee that all the items on the left will be inserted once.
// However, one obvious problem is that the items on the right may be merged
// multiple times into a single left element, or even be merged into an
// element previously appended from the right.
//
// To get around this behavior we have to keep track of when a node has
// already been replaced with a merged one. This gives us the same
// guarantees for the items on the right.
alreadyMerged := NodeSet{}
for len(right) > 0 {
found := false
for i := 0; i < len(newSlice); i++ {
node := newSlice[i]
// Skip any nodes that were previously marked as merged.
if alreadyMerged.Has(node) {
goto next
}
for j, node2 := range right {
merged := mergeFn(node, node2, document)
if !IsNil(merged) {
// Remove the current node, and append the new merged one.
// This will change the order, but the order of nodes
// doesn't matter in GEDCOM files.
newSlice = append(newSlice[:i], newSlice[i+1:]...)
newSlice = append(newSlice, merged)
// We also have to remove the matching right node, otherwise
// we may get stuck into an infinite loop.
right = append(right[:j], right[j+1:]...)
// Record the fact that this new node has been merged so it
// will be avoided next time, even in the case of a merge
// candidate.
alreadyMerged.Add(merged)
found = true
break
}
}
next:
}
if !found {
// See the comment above about why we need to mark the new right
// node as already merged.
newNode := DeepCopy(right[0], document)
alreadyMerged.Add(newNode)
newSlice = append(newSlice, newNode)
right = right[1:]
}
}
return newSlice
}
// MergeDocuments creates a new document with their respective nodes merged.
//
// The MergeFunction must not be nil, but may return nil. See MergeFunction for
// usage.
//
// The left and right may be nil. This is treated as an empty document.
//
// The result document will have a deep copy of all nodes. So it's safe to
// manipulate the nodes without affecting the original nodes.
//
// Individuals will not be merged amongst each other, only appended to the final
// document. To merge similar individuals see MergeDocumentsAndIndividuals.
//
// The document (third parameter) must not be nil and will be used to attach the
// new nodes to (since some nodes require a document, such as individuals). You
// may supply the same document.
func MergeDocuments(left, right *Document, document *Document, mergeFn MergeFunction) *Document {
var leftNodes, rightNodes Nodes
if left != nil {
leftNodes = left.Nodes()
}
if right != nil {
rightNodes = right.Nodes()
}
newNodes := MergeNodeSlices(leftNodes, rightNodes, document, mergeFn)
return NewDocumentWithNodes(newNodes)
}
// MergeDocumentsAndIndividuals merges two documents while also merging similar
// individuals. A new document will be returned.
//
// The MergeFunction must not be nil, but may return nil. It will only be used
// for nodes that are not individuals. See MergeFunction for usage.
//
// The options must be provided.
func MergeDocumentsAndIndividuals(left, right *Document, mergeFn MergeFunction, options *IndividualNodesCompareOptions) (*Document, error) {
// Merge individuals.
leftIndividuals := individuals(left)
rightIndividuals := individuals(right)
document := NewDocument()
mergedIndividuals, err := leftIndividuals.Merge(rightIndividuals, document, options)
if err != nil {
return nil, err
}
// Merge non-individuals.
leftOther := nonIndividuals(left)
rightOther := nonIndividuals(right)
mergedOther := MergeNodeSlices(leftOther, rightOther, document, mergeFn)
allNodes := append(mergedIndividuals.Nodes(), mergedOther...)
return NewDocumentWithNodes(allNodes), nil
}
// IndividualBySurroundingSimilarityMergeFunction creates a MergeFunction that
// will merge individuals if their surrounding similarity is at least
// minimumSimilarity.
//
// If either of the nodes are not IndividualNode instances then it will always
// return nil so they will not be merged.
//
// The minimumSimilarity should be value between 0.0 and 1.0. The options must
// not be nil, you should use NewSimilarityOptions() for sensible defaults.
//
// The document must not be nil and will determine where the merged nodes will
// end up. This is important for nodes that need to be attached to a document.
func IndividualBySurroundingSimilarityMergeFunction(minimumSimilarity float64, options SimilarityOptions) MergeFunction {
return func(left, right Node, document *Document) Node {
leftIndividual, leftOK := left.(*IndividualNode)
rightIndividual, rightOK := right.(*IndividualNode)
// Either side is not an individual.
if !leftOK || !rightOK {
return nil
}
similarity := leftIndividual.SurroundingSimilarity(rightIndividual, options, false)
if similarity.WeightedSimilarity() > minimumSimilarity {
// Ignore the error here because left and right must be the same
// type.
mergedIndividuals, _ := MergeNodes(left, right, document)
return mergedIndividuals
}
// Do not merge.
return nil
}
}
// EqualityMergeFunction is a MergeFunction that will return a merged node if
// the node are considered equal (with Equals). This is probably the most
// generic case when doing general merges.
//
// The document must not be nil and will be used to attach the merged nodes as
// some nodes need a document to be created (such as Individual nodes).
func EqualityMergeFunction(left, right Node, document *Document) Node {
if left.Equals(right) {
merged, err := MergeNodes(left, right, document)
if err != nil {
return nil
}
return merged
}
return nil
}