Skip to content

Commit

Permalink
Improve AP style
Browse files Browse the repository at this point in the history
  • Loading branch information
jdkato committed Mar 1, 2024
1 parent 9623646 commit f5527d0
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 7 deletions.
18 changes: 12 additions & 6 deletions strcase/title.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ import (
"github.com/errata-ai/regexp2"
"github.com/jdkato/twine/internal"
"github.com/jdkato/twine/nlp/tag"
"github.com/jdkato/twine/nlp/tokenize"
)

// tokenizer is the package-level Treebank word tokenizer; Convert uses it to
// split its input into words before they are handed to tagger.
var tokenizer = tokenize.NewTreebankWordTokenizer()
// tagger is the package-level perceptron tagger; Convert calls tagger.Tag on
// the tokenized words, and optionsAP inspects the Tag field of the results.
var tagger = tag.NewPerceptronTagger()
var smallWords = []string{
"a", "an", "and", "as", "at", "but", "by", "en", "for", "if", "in", "nor",
Expand Down Expand Up @@ -86,7 +88,15 @@ func (tc *TitleConverter) Convert(s string) string {
t := sanitizer.Replace(s)
end := len(t)

tags := tagger.Tag(splitRE.FindAllString(strings.ToLower(s), -1))
// NOTE: We do this because the tagger is sensitive to trailing punctuation
// AND the initial case of the input.
forTagging := s
if !internal.HasAnySuffix(s, []string{".", "!", "?"}) {
forTagging = s + "."
}
words := tokenizer.Tokenize(forTagging)

tags := tagger.Tag(words)
widx := -1

return prefix + splitRE.ReplaceAllStringFunc(s, func(m string) string {
Expand Down Expand Up @@ -133,11 +143,7 @@ func (tc *TitleConverter) inVocab(s string) string {
// See testdata/AP.json for examples.
func optionsAP(word string, tags []tag.Token, idx int, bounding bool) bool {
// "to" is decided from the tags of the tokens that follow it, as long as at
// least one token follows; smallWords is not consulted on this path. When
// "to" is the last token, control falls through to the general rule below.
if word == "to" && idx+1 < len(tags) {
// t1 is true when the next token's tag starts with "NN" (presumably the
// noun tag family -- confirm against the tagger's tag set).
t1 := strings.HasPrefix(tags[idx+1].Tag, "NN")
// If the next token is not "NN*", the token after it (when present)
// decides instead.
if !t1 && idx+2 < len(tags) {
return strings.HasPrefix(tags[idx+2].Tag, "NN")
}
return t1
// NOTE(review): unreachable as written, because of the `return t1` above.
// This listing looks like a diff hunk showing the removed look-ahead logic
// and the added single-token check together; confirm which variant is
// meant to remain.
return strings.HasPrefix(tags[idx+1].Tag, "NN")
}
// General rule: true only when bounding is false and
// internal.StringInSlice(word, smallWords) reports true (presumably a
// membership test -- verify against the helper).
return !bounding && internal.StringInSlice(word, smallWords)
}
Expand Down
10 changes: 9 additions & 1 deletion testdata/AP.json
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,15 @@
"expect": "What I Want To Be When I Grow Up"
},
{
"input": "we went to las vegas to drink",
"input": "we went to Las Vegas to drink",
"expect": "We Went to Las Vegas To Drink"
},
{
"input": "Connect Consul to Gateway",
"expect": "Connect Consul to Gateway"
},
{
"input": "Connect Consul to Gateway.",
"expect": "Connect Consul to Gateway."
}
]

0 comments on commit f5527d0

Please sign in to comment.