-
Notifications
You must be signed in to change notification settings - Fork 28
/
jaro.go
86 lines (72 loc) · 2 KB
/
jaro.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
package smetrics
import (
"math"
)
// The Jaro distance. The result is 1 for equal strings, and 0 for completely different strings.
func Jaro(a, b string) float64 {
// If both strings are zero-length, they are completely equal,
// therefore return 1.
if len(a) == 0 && len(b) == 0 {
return 1
}
// If one string is zero-length, strings are completely different,
// therefore return 0.
if len(a) == 0 || len(b) == 0 {
return 0
}
// Define the necessary variables for the algorithm.
la := float64(len(a))
lb := float64(len(b))
matchRange := int(math.Max(0, math.Floor(math.Max(la, lb)/2.0)-1))
matchesA := make([]bool, len(a))
matchesB := make([]bool, len(b))
var matches float64 = 0
// Step 1: Matches
// Loop through each character of the first string,
// looking for a matching character in the second string.
for i := 0; i < len(a); i++ {
start := int(math.Max(0, float64(i-matchRange)))
end := int(math.Min(lb-1, float64(i+matchRange)))
for j := start; j <= end; j++ {
if matchesB[j] {
continue
}
if a[i] == b[j] {
matchesA[i] = true
matchesB[j] = true
matches++
break
}
}
}
// If there are no matches, strings are completely different,
// therefore return 0.
if matches == 0 {
return 0
}
// Step 2: Transpositions
// Loop through the matches' arrays, looking for
// unaligned matches. Count the number of unaligned matches.
unaligned := 0
j := 0
for i := 0; i < len(a); i++ {
if !matchesA[i] {
continue
}
for !matchesB[j] {
j++
}
if a[i] != b[j] {
unaligned++
}
j++
}
// The number of unaligned matches divided by two, is the number of _transpositions_.
transpositions := math.Floor(float64(unaligned / 2))
// Jaro distance is the average between these three numbers:
// 1. matches / length of string A
// 2. matches / length of string B
// 3. (matches - transpositions/matches)
// So, all that divided by three is the final result.
return ((matches / la) + (matches / lb) + ((matches - transpositions) / matches)) / 3.0
}