Skip to content

Commit

Permalink
Handle case variations algorithmically, improve startup time 100ms ->…
Browse files Browse the repository at this point in the history
… 30ms
  • Loading branch information
nickg committed May 27, 2017
1 parent 2bead47 commit 0ae1469
Show file tree
Hide file tree
Showing 8 changed files with 285 additions and 62,548 deletions.
62 changes: 62 additions & 0 deletions ascii.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package misspell

// ByteToUpper converts an ASCII byte to upper case.
// Bytes outside 'a'..'z', including every byte >= 0x80, are returned
// unchanged. It uses a branchless algorithm.
func ByteToUpper(x byte) byte {
	// Force bit 7 on so the subtractions below never wrap below zero.
	b := byte(0x80) | x
	// Bit 7 of c survives only when the low 7 bits of x are >= 'a' (0x61).
	c := b - byte(0x61)
	// Bit 7 of d is set only when the low 7 bits of x are <= 'z' (0x7a).
	d := ^(b - byte(0x7b))
	// Keep bit 7 alone, and only when x itself is ASCII (its bit 7 clear).
	// The mask must be 0x80: any other bits leaking through would corrupt
	// the result for non-letters.
	e := (c & d) & (^x & 0x80)
	// 0x80 >> 2 == 0x20, the distance between lower and upper case letters.
	return x - (e >> 2)
}

// ByteToLower converts an ASCII byte to lower case.
// Bytes outside 'A'..'Z', including every byte >= 0x80, are returned
// unchanged. It uses a branchless algorithm.
func ByteToLower(eax byte) byte {
	const (
		low7    = byte(0x7f) // strips bit 7 so it can be used as a carry flag
		caseBit = byte(0x20) // distance between upper and lower case letters
	)
	// After the two additions, bit 7 of t is set only when the low 7 bits
	// of eax fall in 'A'..'Z' (0x41..0x5a).
	t := (eax & low7) + 0x25
	t = (t & low7) + 0x1a
	// Drop the flag for non-ASCII input, then move bit 7 down to bit 5.
	return eax + (((t &^ eax) >> 2) & caseBit)
}

// ByteEqualFold reports whether a and b are the same byte once ASCII
// case is ignored.
func ByteEqualFold(a, b byte) bool {
	// Identical bytes need no folding.
	if a == b {
		return true
	}
	return ByteToLower(a) == ByteToLower(b)
}

// StringEqualFold reports whether s1 and s2 are equal when ASCII case is
// ignored. The comparison is byte-wise: only the letters 'a'..'z' and
// 'A'..'Z' are folded, and every other byte must match exactly.
// Based on https://codereview.appspot.com/5180044/patch/14007/21002
func StringEqualFold(s1, s2 string) bool {
	if len(s1) != len(s2) {
		return false
	}
	const caseBit = 'a' - 'A'
	for i := 0; i < len(s1); i++ {
		x, y := s1[i], s2[i]
		if x == y {
			continue
		}
		// Differing bytes match only if they are the same letter in
		// different case: equal once the case bit is forced on, and
		// actually a letter.
		folded := x | caseBit
		if folded != y|caseBit {
			return false
		}
		if folded < 'a' || folded > 'z' {
			return false
		}
	}
	return true
}

// StringHasPrefixFold reports whether s1 begins with the prefix s2,
// ignoring ASCII case. It is similar to strings.HasPrefix.
func StringHasPrefixFold(s1, s2 string) bool {
	n := len(s2)
	// A prefix longer than the input can never match; otherwise compare
	// the leading n bytes of s1 against the whole prefix.
	return n <= len(s1) && StringEqualFold(s1[:n], s2)
}
54 changes: 18 additions & 36 deletions case.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,44 @@ package misspell

import (
"strings"
"unicode"
)

// WordCase is an enum of various word casing styles
type WordCase int

// Various WordCase types. The classification is likely imperfect.
const (
AllLower WordCase = iota
AllUpper
Title
Mixed
Camel
CaseUnknown WordCase = iota
CaseLower
CaseUpper
CaseTitle
)

// CaseStyle returns what case style a word is in
func CaseStyle(word string) WordCase {
hasTitle := false
upperCount := 0
lowerCount := 0
runeCount := 0

// this iterates over RUNES not BYTES
for _, r := range word {
// ASCII apostrophe doesn't count
// want words like "don't" to have
// upper case forms when adding to dictionary
if r == 0x0027 {
continue
}
runeCount++
if unicode.IsLower(r) {
for i := 0; i < len(word); i++ {
ch := word[i]
switch {
case ch >= 'a' && ch <= 'z':
lowerCount++
continue
}
if unicode.IsUpper(r) {
if runeCount == 1 {
hasTitle = true
}
case ch >= 'A' && ch <= 'Z':
upperCount++
continue
}

//???
}

switch {
case runeCount == lowerCount:
return AllLower
case runeCount == upperCount:
return AllUpper
case hasTitle && runeCount-1 == lowerCount:
return Title
default:
return Mixed
case upperCount != 0 && lowerCount == 0:
return CaseUpper
case upperCount == 0 && lowerCount != 0:
return CaseLower
case upperCount == 1 && lowerCount > 0 && word[0] >= 'A' && word[0] <= 'Z':
return CaseTitle
}
return CaseUnknown
}

// CaseVariations returns
Expand All @@ -67,9 +49,9 @@ func CaseStyle(word string) WordCase {
//
func CaseVariations(word string, style WordCase) []string {
switch style {
case AllLower:
case CaseLower:
return []string{word, strings.ToUpper(word[0:1]) + word[1:], strings.ToUpper(word)}
case AllUpper:
case CaseUpper:
return []string{strings.ToUpper(word)}
default:
return []string{word, strings.ToUpper(word)}
Expand Down
12 changes: 6 additions & 6 deletions case_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ func TestCaseStyle(t *testing.T) {
word string
want WordCase
}{
{"lower", AllLower},
{"what's", AllLower},
{"UPPER", AllUpper},
{"Title", Title},
{"CamelCase", Mixed},
{"camelCase", Mixed},
{"lower", CaseLower},
{"what's", CaseLower},
{"UPPER", CaseUpper},
{"Title", CaseTitle},
{"CamelCase", CaseUnknown},
{"camelCase", CaseUnknown},
}

for pos, tt := range cases {
Expand Down
6 changes: 3 additions & 3 deletions replace.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ type Diff struct {
type Replacer struct {
Replacements []string
Debug bool
engine *strings.Replacer
engine *StringReplacer
corrected map[string]string
}

Expand Down Expand Up @@ -83,7 +83,7 @@ func (r *Replacer) Compile() {
for i := 0; i < len(r.Replacements); i += 2 {
r.corrected[r.Replacements[i]] = r.Replacements[i+1]
}
r.engine = strings.NewReplacer(r.Replacements...)
r.engine = NewStringReplacer(r.Replacements...)
}

/*
Expand All @@ -110,7 +110,7 @@ func (r *Replacer) recheckLine(s string, lineNum int, buf io.Writer, next func(D
// no replacement done
continue
}
if r.corrected[word] == newword {
if StringEqualFold(r.corrected[strings.ToLower(word)], newword) {
// word got corrected into something we know
io.WriteString(buf, s[first:ab[0]])
io.WriteString(buf, newword)
Expand Down
2 changes: 1 addition & 1 deletion replace_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ func TestReplace(t *testing.T) {

func TestCheckReplace(t *testing.T) {
r := Replacer{
engine: strings.NewReplacer("foo", "foobar", "runing", "running"),
engine: NewStringReplacer("foo", "foobar", "runing", "running"),
corrected: map[string]string{
"foo": "foobar",
"runing": "running",
Expand Down
Loading

0 comments on commit 0ae1469

Please sign in to comment.