forked from olivia-ai/olivia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathformat.go
108 lines (85 loc) · 2.82 KB
/
format.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
package analysis
import (
"fmt"
"regexp"
"strings"
"github.com/olivia-ai/olivia/locales"
"github.com/olivia-ai/olivia/util"
"github.com/tebeka/snowball"
)
// Punctuation handling for arrange. Compiled once at package scope so the
// regexes are not rebuilt on every call (regexp.MustCompile is the idiomatic
// place for fixed patterns).
var (
	// A letter optionally followed by a space, then a sentence-ending or
	// inverted (Spanish) punctuation mark.
	arrangeTrailingPunctRegex = regexp.MustCompile(`[a-zA-Z]( )?(\.|\?|!|¿|¡)`)
	// The punctuation marks themselves, stripped from each match above.
	// Includes ¿ and ¡ to match the set the outer pattern detects.
	arrangePunctRegex = regexp.MustCompile(`(\.|\?|!|¿|¡)`)
)

// arrange checks the format of a string to normalize it, remove ignored characters
func (sentence *Sentence) arrange() {
	// Remove punctuation that directly follows a letter, keeping the letter.
	sentence.Content = arrangeTrailingPunctRegex.ReplaceAllStringFunc(sentence.Content, func(s string) string {
		return arrangePunctRegex.ReplaceAllString(s, "")
	})

	// Treat hyphenated compounds as separate words, then trim edge whitespace.
	sentence.Content = strings.ReplaceAll(sentence.Content, "-", " ")
	sentence.Content = strings.TrimSpace(sentence.Content)
}
// removeStopWords takes an array of words, removes the stopwords and returns it.
func removeStopWords(locale string, words []string) []string {
	// Don't remove stopwords for small sentences like “How are you” because it
	// would strip out all the words.
	if len(words) <= 4 {
		return words
	}

	// Read the content of the stopwords file (one stopword per line).
	stopWords := string(util.ReadFile("res/locales/" + locale + "/stopwords.txt"))

	// Build a set for exact-match O(1) lookups. The previous implementation
	// used strings.Contains, a substring test that wrongly removed any word
	// appearing inside a longer stopword (e.g. "her" inside "there").
	// TrimSpace also tolerates trailing \r from CRLF stopword files.
	stopWordSet := make(map[string]struct{})
	for _, stopWord := range strings.Split(stopWords, "\n") {
		stopWord = strings.TrimSpace(stopWord)
		if stopWord != "" {
			stopWordSet[stopWord] = struct{}{}
		}
	}

	// Collect the words that are exact stopwords.
	var wordsToRemove []string
	for _, word := range words {
		if _, isStopWord := stopWordSet[word]; isStopWord {
			wordsToRemove = append(wordsToRemove, word)
		}
	}

	return util.Difference(words, wordsToRemove)
}
// tokenize returns a list of words that have been lower-cased
func (sentence Sentence) tokenize() (tokens []string) {
	// Split the content on whitespace and lower-case each resulting word.
	for _, field := range strings.Fields(sentence.Content) {
		tokens = append(tokens, strings.ToLower(field))
	}

	// Strip the locale's stopwords before handing the tokens back.
	tokens = removeStopWords(sentence.Locale, tokens)
	return
}
// stem returns the sentence split in stemmed words
func (sentence Sentence) stem() (tokenizeWords []string) {
	// Resolve the snowball language name, falling back to English when the
	// sentence's locale is unknown.
	language := locales.GetTagByName(sentence.Locale)
	if language == "" {
		language = "english"
	}

	tokens := sentence.tokenize()

	stemmer, err := snowball.New(language)
	if err != nil {
		fmt.Println("Stemmer error", err)
		return
	}

	// Reduce every token to its stem and collect the results.
	for _, token := range tokens {
		tokenizeWords = append(tokenizeWords, stemmer.Stem(token))
	}
	return
}
// WordsBag retrieves the intents words and returns the sentence converted in a
// bag of words: bag[i] is 1 when words[i] occurs in the stemmed sentence, 0
// otherwise.
func (sentence Sentence) WordsBag(words []string) (bag []float64) {
	// Stem the sentence once, outside the loop. The previous version called
	// sentence.stem() for every vocabulary word, re-tokenizing and re-reading
	// the stopwords file on each iteration.
	stemmedWords := sentence.stem()

	bag = make([]float64, 0, len(words))
	for _, word := range words {
		// Append 1 if the stemmed sentence contains the word, else 0.
		var valueToAppend float64
		if util.Contains(stemmedWords, word) {
			valueToAppend = 1
		}
		bag = append(bag, valueToAppend)
	}

	return bag
}