Skip to content

Commit 831d05d

Browse files
committed
Strip diacritics from sort keys if query is ASCII
1 parent f94c02c commit 831d05d

File tree

3 files changed

+118
-49
lines changed

3 files changed

+118
-49
lines changed

doc.go

+2
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ on your own structs and customising the sort settings.
9595
The algorithm is based on Forrest Smith's reverse engineering of Sublime
9696
Text's search: https://blog.forrestthewoods.com/reverse-engineering-sublime-text-s-fuzzy-match-4cffeed33fdb
9797
98+
It additionally strips diacritics from sort keys if the query is ASCII.
99+
98100
99101
Sending results to Alfred
100102

fuzzy.go

+95-49
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,13 @@
99
package aw
1010

1111
import (
12+
"log"
1213
"sort"
1314
"strings"
15+
"unicode"
16+
17+
"golang.org/x/text/transform"
18+
"golang.org/x/text/unicode/norm"
1419
)
1520

1621
// Default bonuses and penalties for fuzzy sorting. To customise
@@ -24,8 +29,15 @@ const (
2429
DefaultLeadingLetterPenalty = -3.0 // Penalty applied for every letter in string before first match
2530
DefaultMaxLeadingLetterPenalty = -9.0 // Maximum penalty for leading letters
2631
DefaultUnmatchedLetterPenalty = -1.0 // Penalty for every letter that doesn't match
32+
DefaultStripDiacritics = true // Strip diacritics from sort keys if query is plain ASCII
2733
)
2834

35+
var stripper transform.Transformer
36+
37+
func init() {
38+
stripper = transform.Chain(norm.NFD, transform.RemoveFunc(isMn), norm.NFC)
39+
}
40+
2941
// Sortable makes the implementer fuzzy-sortable. It is a superset
3042
// of sort.Interface (i.e. your struct must also implement sort.Interface).
3143
type Sortable interface {
@@ -57,6 +69,7 @@ type SortOptions struct {
5769
LeadingLetterPenalty float64 // Penalty applied for every letter in string before first match
5870
MaxLeadingLetterPenalty float64 // Maximum penalty for leading letters
5971
UnmatchedLetterPenalty float64 // Penalty for every letter that doesn't match
72+
StripDiacritics bool // Strip diacritics from sort keys if query is plain ASCII
6073
}
6174

6275
// NewSortOptions creates a SortOptions object with the default values.
@@ -68,6 +81,7 @@ func NewSortOptions() *SortOptions {
6881
LeadingLetterPenalty: DefaultLeadingLetterPenalty,
6982
MaxLeadingLetterPenalty: DefaultMaxLeadingLetterPenalty,
7083
UnmatchedLetterPenalty: DefaultUnmatchedLetterPenalty,
84+
StripDiacritics: DefaultStripDiacritics,
7185
}
7286
}
7387

@@ -77,8 +91,10 @@ type Sorter struct {
7791
Data Sortable
7892
// Options contains the bonuses and penalties
7993
Options *SortOptions
80-
// results stores the results of the fuzzy sort
81-
results []*Result
94+
95+
query string // Search query
96+
stripDiacritics bool // Whether sort keys need folding
97+
results []*Result // Results of the fuzzy sort
8298
}
8399

84100
// NewSorter returns a new Sorter. If opts is nil, Sorter is initialised
@@ -116,57 +132,31 @@ func (s *Sorter) Swap(i, j int) {
116132

117133
// Sort sorts data against query.
118134
func (s *Sorter) Sort(query string) []*Result {
119-
if s.results == nil {
120-
s.results = make([]*Result, s.Data.Len())
135+
s.results = make([]*Result, s.Data.Len())
136+
s.query = query
137+
if isASCII(query) && s.Options.StripDiacritics {
138+
s.stripDiacritics = true
121139
}
122-
140+
// Generate matches for Data, then call sort.Sort()
123141
for i := 0; i < s.Data.Len(); i++ {
124142
key := s.Data.SortKey(i)
125-
// s.matches[i] = match
126-
// s.scores[i] = score
127-
s.results[i] = Match(key, query, s.Options)
143+
s.results[i] = s.Match(key)
128144
}
129145
sort.Sort(s)
130146
return s.results
131147
}
132148

133-
// Sort sorts data against query. Convenience that creates and
134-
// uses a Sorter with the default settings.
135-
func Sort(data Sortable, query string) []*Result {
136-
s := NewSorter(data, nil)
137-
return s.Sort(query)
138-
}
139-
140-
// stringSlice implements sort.Interface for []string.
141-
// It is a helper for SortStrings.
142-
type stringSlice struct {
143-
data []string
144-
}
145-
146-
// Len etc. implement sort.Interface.
147-
func (s stringSlice) Len() int { return len(s.data) }
148-
func (s stringSlice) Less(i, j int) bool { return s.data[i] < s.data[j] }
149-
func (s stringSlice) Swap(i, j int) { s.data[i], s.data[j] = s.data[j], s.data[i] }
150-
151-
// SortKey implements Sortable.
152-
func (s stringSlice) SortKey(i int) string { return s.data[i] }
153-
154-
// Sort is a convenience method.
155-
func (s stringSlice) Sort(query string) []*Result { return Sort(s, query) }
156-
157-
// SortStrings is a convenience function.
158-
func SortStrings(data []string, query string) []*Result {
159-
s := stringSlice{data}
160-
return s.Sort(query)
161-
}
162-
163149
// Match scores str against the Sorter's current query using fuzzy matching
// and the Sorter's Options.
164-
func Match(str, query string, o *SortOptions) *Result {
150+
func (s *Sorter) Match(str string) *Result {
151+
if s.stripDiacritics {
152+
str = stripDiacritics(str)
153+
}
154+
165155
var (
166156
match = false
167157
score = 0.0
168158
uStr = []rune(str)
169-
uQuery = []rune(query)
159+
uQuery = []rune(s.query)
170160
strLen = len(uStr)
171161
queryLen = len(uQuery)
172162
)
@@ -232,26 +222,26 @@ func Match(str, query string, o *SortOptions) *Result {
232222

233223
// Apply penalty for letters before first match
234224
if queryIdx == 0 {
235-
penalty = float64(strIdx) * o.LeadingLetterPenalty
236-
if penalty <= o.MaxLeadingLetterPenalty {
237-
penalty = o.MaxLeadingLetterPenalty
225+
penalty = float64(strIdx) * s.Options.LeadingLetterPenalty
226+
if penalty <= s.Options.MaxLeadingLetterPenalty {
227+
penalty = s.Options.MaxLeadingLetterPenalty
238228
}
239229
score += penalty
240230
}
241231

242232
// Apply bonus for consecutive matches
243233
if prevMatched {
244-
newScore += o.AdjacencyBonus
234+
newScore += s.Options.AdjacencyBonus
245235
}
246236

247237
// Apply bonus for match after separator
248238
if prevSeparator {
249-
newScore += o.SeparatorBonus
239+
newScore += s.Options.SeparatorBonus
250240
}
251241

252242
// Apply bonus across camel case boundaries
253243
if prevLower && strChar == strUpper && strLower != strUpper {
254-
newScore += o.CamelBonus
244+
newScore += s.Options.CamelBonus
255245
}
256246

257247
// Update query index if next query letter was matched
@@ -264,7 +254,7 @@ func Match(str, query string, o *SortOptions) *Result {
264254
if newScore >= bestLetterScore {
265255

266256
if bestLetter != "" {
267-
score += o.UnmatchedLetterPenalty
257+
score += s.Options.UnmatchedLetterPenalty
268258
}
269259

270260
bestLetter = strChar
@@ -274,7 +264,7 @@ func Match(str, query string, o *SortOptions) *Result {
274264

275265
prevMatched = true
276266
} else {
277-
score += o.UnmatchedLetterPenalty
267+
score += s.Options.UnmatchedLetterPenalty
278268
prevMatched = false
279269
}
280270

@@ -303,5 +293,61 @@ func Match(str, query string, o *SortOptions) *Result {
303293
}
304294

305295
// log.Printf("query=%#v, str=%#v, match=%v, score=%v", query, str, match, score)
306-
return &Result{match, query, score, str}
296+
return &Result{match, s.query, score, str}
297+
}
298+
299+
// Sort sorts data against query. Convenience that creates and
300+
// uses a Sorter with the default settings.
301+
func Sort(data Sortable, query string) []*Result {
302+
s := NewSorter(data, nil)
303+
return s.Sort(query)
304+
}
305+
306+
// stringSlice wraps a []string so that it satisfies sort.Interface and
// Sortable. It backs SortStrings.
type stringSlice struct {
	data []string
}

// Len reports the number of strings held.
func (s stringSlice) Len() int {
	return len(s.data)
}

// Less reports whether the string at i orders before the string at j.
func (s stringSlice) Less(i, j int) bool {
	a, b := s.data[i], s.data[j]
	return a < b
}

// Swap exchanges the strings at i and j.
func (s stringSlice) Swap(i, j int) {
	d := s.data
	d[i], d[j] = d[j], d[i]
}

// SortKey implements Sortable; the sort key is the string itself.
func (s stringSlice) SortKey(i int) string {
	return s.data[i]
}
319+
320+
// Sort is a convenience method.
321+
func (s stringSlice) Sort(query string) []*Result { return Sort(s, query) }
322+
323+
// SortStrings is a convenience function.
324+
func SortStrings(data []string, query string) []*Result {
325+
s := stringSlice{data}
326+
return s.Sort(query)
327+
}
328+
329+
// Match scores str against query using fuzzy matching and the specified sort options.
330+
// WARNING: Match creates a new Sorter for every call. Don't use this on
331+
// large datasets.
332+
func Match(str, query string, o *SortOptions) *Result {
333+
data := stringSlice{[]string{str}}
334+
s := NewSorter(data, o)
335+
return s.Sort(query)[0]
336+
}
337+
338+
// isMn reports whether r belongs to Unicode category Mn (non-spacing mark),
// the category that combining diacritics fall into.
func isMn(r rune) bool {
	return unicode.In(r, unicode.Mn)
}
341+
342+
func stripDiacritics(s string) string {
343+
stripped, _, err := transform.String(stripper, s)
344+
if err != nil {
345+
log.Printf("Couldn't strip diacritics from `%s`: %s", s, err)
346+
return s
347+
}
348+
return stripped
349+
}
350+
351+
// isASCII reports whether s consists solely of ASCII characters. The empty
// string is considered ASCII.
//
// The bytes are checked directly rather than comparing s with its
// diacritic-stripped form: that comparison also accepts non-ASCII text that
// carries no combining marks (e.g. "ß" or "日本語").
func isASCII(s string) bool {
	for i := 0; i < len(s); i++ {
		// Every byte of a multi-byte UTF-8 sequence is above MaxASCII.
		if s[i] > unicode.MaxASCII {
			return false
		}
	}
	return true
}

fuzzy_test.go

+21
Original file line numberDiff line numberDiff line change
@@ -191,3 +191,24 @@ func TestFilterFeedback(t *testing.T) {
191191
}
192192
}
193193
}
194+
195+
// TestStripDiacritics
196+
func TestStripDiacritics(t *testing.T) {
197+
o := NewSortOptions()
198+
// Non-ASCII query and data
199+
if r := Match("fün", "fün", o); r.Match == false {
200+
t.Fatalf("fün != fün (diacritic stripping on): %v", r)
201+
}
202+
// Non-ASCII data
203+
if r := Match("fün", "fun", o); r.Match == false {
204+
t.Fatalf("fun != fün (diacritic stripping on): %v", r)
205+
}
206+
// No diacritic stripping
207+
o.StripDiacritics = false
208+
if r := Match("fün", "fün", o); r.Match == false {
209+
t.Fatalf("fün != fün (diacritic stripping off): %s", r)
210+
}
211+
if r := Match("fün", "fun", o); r.Match == true {
212+
t.Fatalf("fun != fün (diacritic stripping off): %s", r)
213+
}
214+
}

0 commit comments

Comments
 (0)