Skip to content

Commit

Permalink
Updated newick parser to allow ';' in comments
Browse files Browse the repository at this point in the history
  • Loading branch information
fredericlemoine committed Sep 14, 2023
1 parent b7837b6 commit c7e2aa1
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 16 deletions.
23 changes: 17 additions & 6 deletions io/newick/newick_lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ func (s *Scanner) unread() {
}

// Scan returns the next token and literal value.
func (s *Scanner) Scan() (tok Token, lit string) {
// ignoreSemiColumn allows parsing identifiers that contain ";",
// such as comments [...;...].
func (s *Scanner) Scan(ignoreSemiColumn bool) (tok Token, lit string) {
// Read the next rune.
ch := s.read()

Expand All @@ -60,13 +62,15 @@ func (s *Scanner) Scan() (tok Token, lit string) {
case ',':
return NEWSIBLING, string(ch)
case ';':
return EOT, string(ch)
if !ignoreSemiColumn {
return EOT, string(ch)
}
case ':':
return STARTLEN, string(ch)
}

s.unread()
return s.scanIdent()
return s.scanIdent(ignoreSemiColumn)
}

// scanWhitespace consumes the current rune and all contiguous whitespace.
Expand All @@ -91,8 +95,15 @@ func (s *Scanner) scanWhitespace() (tok Token, lit string) {
return WS, buf.String()
}

// scanIdent consumes the current rune and all contiguous ident runes.
func (s *Scanner) scanIdent() (tok Token, lit string) {
// scanIdent consumes the current rune and all contiguous identifier runes.
// An identifier can be:
// - a tip, node or branch name
// - a comment
// - a branch length
// - a branch support
// and contains no newick keywords. If ignoreSemiColumn is true, then ";" is not
// considered a newick keyword (useful for parsing comments [...;...]).
func (s *Scanner) scanIdent(ignoreSemiColumn bool) (tok Token, lit string) {
// Create a buffer and read the current character into it.
var buf bytes.Buffer
buf.WriteRune(s.read())
Expand All @@ -102,7 +113,7 @@ func (s *Scanner) scanIdent() (tok Token, lit string) {
for {
if ch := s.read(); ch == eof {
break
} else if !isIdent(ch) {
} else if !isIdent(ch, ignoreSemiColumn) {
s.unread()
break
} else {
Expand Down
18 changes: 10 additions & 8 deletions io/newick/newick_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,17 @@ func NewParser(r io.Reader) *Parser {

// scan returns the next token from the underlying scanner.
// If a token has been unscanned then read that instead.
func (p *Parser) scan() (tok Token, lit string) {
// ignoreSemiColumn allows parsing identifiers that contain ";",
// such as comments [...;...].
func (p *Parser) scan(ignoreSemiColumn bool) (tok Token, lit string) {
// If we have a token on the buffer, then return it.
if p.buf.n != 0 {
p.buf.n = 0
return p.buf.tok, p.buf.lit
}

// Otherwise read the next token from the scanner.
tok, lit = p.s.Scan()
tok, lit = p.s.Scan(ignoreSemiColumn)

// Save it to the buffer in case we unscan later.
p.buf.tok, p.buf.lit = tok, lit
Expand All @@ -49,9 +51,9 @@ func (p *Parser) unscan() { p.buf.n = 1 }

// scanIgnoreWhitespace scans the next non-whitespace token.
func (p *Parser) scanIgnoreWhitespace() (tok Token, lit string) {
tok, lit = p.scan()
tok, lit = p.scan(false)
if tok == WS {
tok, lit = p.scan()
tok, lit = p.scan(false)
}
return
}
Expand Down Expand Up @@ -287,20 +289,20 @@ func (p *Parser) parseIter(t *tree.Tree, level *int) (prevTok Token, err error)
}
}

// Consumes comment inside brakets [comment] if the given current token is a [.
// Consumes comment inside brackets [comment] if the given current token is a [.
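// It concatenates the literals scanned before the matching closing token (] for a [, LABEL for a LABEL) and returns them as the comment.
// If the given token is neither a [ nor a LABEL token, then returns an error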
func (p *Parser) consumeComment(curtoken Token, curlit string) (comment string, err error) {
if curtoken == OPENBRACK || curtoken == LABEL {
commenttoken, commentlit := p.scanIgnoreWhitespace()
commenttoken, commentlit := p.scan(true)
for (curtoken == LABEL && commenttoken != LABEL) || (curtoken == OPENBRACK && commenttoken != CLOSEBRACK) {
if commenttoken == EOF || commenttoken == ILLEGAL {
err = fmt.Errorf("unmatched bracket")
err = fmt.Errorf("unmatched bracket: %s (%s)", comment, commentlit)
return
} else {
comment += commentlit
}
commenttoken, commentlit = p.scanIgnoreWhitespace()
commenttoken, commentlit = p.scan(true)
}
} else {
err = fmt.Errorf("a comment must start with [")
Expand Down
12 changes: 10 additions & 2 deletions io/newick/newick_token.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,17 @@ func isWhitespace(ch rune) bool {
return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
}

func isIdent(ch rune) bool {
// isIdent checks whether the given rune is part of an identifier, such as:
// - a tip, node or branch name
// - a comment
// - a branch length
// - a branch support
// If it corresponds to a newick keyword, it returns false.
// If ignoreSemiColumn is true, then ";" is not considered
// a newick keyword (useful for parsing comments [...;...]).
func isIdent(ch rune, ignoreSemiColumn bool) bool {
return ch != '[' && ch != ']' &&
ch != '(' && ch != ')' &&
ch != ',' && ch != ':' &&
ch != ';' && ch != '\''
(ignoreSemiColumn || ch != ';') && ch != '\''
}

0 comments on commit c7e2aa1

Please sign in to comment.