diff --git a/lex.go b/lex.go
index 4b69cbea..a39b8a08 100644
--- a/lex.go
+++ b/lex.go
@@ -65,6 +65,7 @@ type LexOptions struct {
 // RegisterLexer is an option that can be used to add a lexer to tokenize external NGINX tokens.
 type RegisterLexer interface {
 	applyLexOptions(options *LexOptions)
+	applyScannerOptions(options *scannerOptions)
 }
 
 type registerLexer struct {
@@ -82,6 +83,16 @@ func (rl registerLexer) applyLexOptions(o *LexOptions) {
 	}
 }
 
+func (rl registerLexer) applyScannerOptions(o *scannerOptions) {
+	if o.extensions == nil {
+		o.extensions = make(map[string]ScannerExt)
+	}
+
+	for _, s := range rl.stringTokens {
+		o.extensions[s] = &LexerScanner{lexer: rl.l}
+	}
+}
+
 // LexWithLexer registers a Lexer that implements tokenization of an NGINX configuration after one of the given
 // stringTokens is encountered by Lex.
 func LexWithLexer(l Lexer, stringTokens ...string) RegisterLexer { //nolint:ireturn
@@ -106,12 +117,38 @@ func Lex(reader io.Reader) chan NgxToken {
 // SubScanner provides an interface for scanning alternative grammars within NGINX configuration data.
 type SubScanner struct {
 	scanner   *bufio.Scanner
+	parent    *Scanner
 	tokenLine int
 }
 
 // Scan advances the scanner to the next token which will be available through the Text method. It returns false
 // when the scan stops by reaching the end of input.
 func (e *SubScanner) Scan() bool {
+	if e.scanner != nil {
+		return e.lexScan()
+	}
+
+	if e.parent.err != nil {
+		return false
+	}
+
+	if !e.parent.scanner.Scan() {
+		if err := e.parent.scanner.Err(); err != nil {
+			e.parent.setErr(err)
+		}
+		return false
+	}
+
+	// e.parent.prev = e.parent.scanner.Text()
+	// if isEOL(e.parent.prev) {
+	if t := e.parent.scanner.Text(); isEOL(t) {
+		e.parent.lineno++
+	}
+
+	return true
+}
+
+func (e *SubScanner) lexScan() bool {
 	if !e.scanner.Scan() {
 		return false
 	}
@@ -122,13 +159,30 @@ func (e *SubScanner) Scan() bool {
 }
 
 // Err returns the first non-EOF error encountered by the Scanner.
-func (e *SubScanner) Err() error { return e.scanner.Err() }
+func (e *SubScanner) Err() error {
+	if e.scanner != nil {
+		return e.scanner.Err()
+	}
+	return e.parent.Err()
+}
 
 // Text returns the most recent token generated by a call to Scan.
-func (e *SubScanner) Text() string { return e.scanner.Text() }
+func (e *SubScanner) Text() string {
+	if e.scanner != nil {
+		return e.scanner.Text()
+	}
+	// return e.parent.prev
+	return e.parent.scanner.Text()
+}
 
 // Line returns the line number of the most recent token generated by a call to Scan.
-func (e *SubScanner) Line() int { return e.tokenLine }
+func (e *SubScanner) Line() int {
+	if e.scanner != nil {
+		return e.tokenLine
+	}
+
+	return e.parent.lineno
+}
 
 //nolint:gocyclo,funlen,gocognit,maintidx
 func tokenize(reader io.Reader, tokenCh chan NgxToken, options LexOptions) {
diff --git a/lex_test.go b/lex_test.go
index cb3c5148..d7348089 100644
--- a/lex_test.go
+++ b/lex_test.go
@@ -415,6 +415,20 @@ var lexFixtures = []lexFixture{
 		{"}", 20},
 		{"}", 21},
 	}},
+	{"comments-between-args", []tokenLine{
+		{"http", 1},
+		{"{", 1},
+		{"#comment 1", 1},
+		{"log_format", 2},
+		{"#comment 2", 2},
+		{"\\#arg\\ 1", 3},
+		{"#comment 3", 3},
+		{"#arg 2", 4},
+		{"#comment 4", 4},
+		{"#comment 5", 5},
+		{";", 6},
+		{"}", 7},
+	}},
 }
 
 func TestLex(t *testing.T) {
@@ -446,22 +460,72 @@ func TestLex(t *testing.T) {
 	}
 }
 
-func TestLex_unhappy(t *testing.T) {
-	t.Parallel()
+func benchmarkLex(b *testing.B, path string, options LexOptions) {
+	var t NgxToken
+
+	file, err := os.Open(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer file.Close()
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		if _, err := file.Seek(0, 0); err != nil {
+			b.Fatal(err)
+		}
+
+		for tok := range LexWithOptions(file, options) {
+			t = tok
+		}
+	}
+
+	_ = t
+}
+
+func BenchmarkLex(b *testing.B) {
+	for _, bm := range lexFixtures {
+		if strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
 
-	testcases := map[string]string{
-		"unbalanced open brance":                  `http {{}`,
-		"unbalanced closing brace":                `http {}}`,
-		"multiple open braces":                    `http {{server {}}`,
-		"multiple closing braces after block end": `http {server {}}}`,
-		"multiple semicolons":                     `server { listen 80;; }`,
-		"semicolon afer closing brace":            `server { listen 80; };`,
-		"open brace after semicolon":              `server { listen 80; {}`,
-		"braces with no directive":                `http{}{}`,
-		"missing final brace":                     `http{`,
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			benchmarkLex(b, path, LexOptions{})
+		})
 	}
+}
+
+func BenchmarkLexWithLua(b *testing.B) {
+	for _, bm := range lexFixtures {
+		if !strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
+
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			benchmarkLex(b, path, LexOptions{Lexers: []RegisterLexer{lua.RegisterLexer()}})
+		})
+	}
+}
+
+//nolint:gochecknoglobals
+var unhappyFixtures = map[string]string{
+	"unbalanced open brace":                   `http {{}`,
+	"unbalanced closing brace":                `http {}}`,
+	"multiple open braces":                    `http {{server {}}`,
+	"multiple closing braces after block end": `http {server {}}}`,
+	"multiple semicolons":                     `server { listen 80;; }`,
+	"semicolon after closing brace":           `server { listen 80; };`,
+	"open brace after semicolon":              `server { listen 80; {}`,
+	"braces with no directive":                `http{}{}`,
+	"missing final brace":                     `http{`,
+}
+
+func TestLex_unhappy(t *testing.T) {
+	t.Parallel()
 
-	for name, c := range testcases {
+	for name, c := range unhappyFixtures {
 		c := c
 		t.Run(name, func(t *testing.T) {
 			t.Parallel()
diff --git a/scanner.go b/scanner.go
new file mode 100644
index 00000000..b688f078
--- /dev/null
+++ b/scanner.go
@@ -0,0 +1,349 @@
+package crossplane
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+type scannerOptions struct {
+	extensions map[string]ScannerExt
+}
+
+type ScannerOption interface {
+	applyScannerOptions(options *scannerOptions)
+}
+
+// Token is a lexical token of the NGINX configuration syntax.
+type Token struct {
+	// Text is the string corresponding to the token. It could be a directive or symbol. The value is the actual token
+	// sequence in order to support defining directives in modules other than the core NGINX module set.
+	Text string
+	// Line is the source starting line number of the token within a file.
+	Line int
+	// IsQuoted signifies if the token is wrapped by quotes (", '). Quotes are not usually necessary in an NGINX
+	// configuration and mostly serve to help make the config less ambiguous.
+	IsQuoted bool
+}
+
+func (t Token) String() string { return fmt.Sprintf("{%d, %s, %t}", t.Line, t.Text, t.IsQuoted) }
+
+type scannerError struct {
+	msg  string
+	line int
+}
+
+func (e *scannerError) Error() string { return e.msg }
+func (e *scannerError) Line() int     { return e.line }
+
+func newScannerErrf(line int, format string, a ...any) *scannerError {
+	return &scannerError{line: line, msg: fmt.Sprintf(format, a...)}
+}
+
+// LineNumber reports the line on which the error occurred by finding the first error in
+// the error chain that returns a line number. Otherwise, it returns 0, false.
+//
+// An error type should provide a Line() int method to return a line number.
+func LineNumber(err error) (int, bool) {
+	var e interface{ Line() int }
+	if !errors.As(err, &e) {
+		return 0, false
+	}
+
+	return e.Line(), true
+}
+
+// Scanner provides an interface for tokenizing an NGINX configuration. Successive calls to the Scan method will step
+// through the 'tokens' of an NGINX configuration.
+//
+// Scanning stops unrecoverably at EOF, the first I/O error, or an unexpected token.
+//
+// Use NewScanner to construct a Scanner.
+type Scanner struct {
+	scanner              *bufio.Scanner
+	lineno               int
+	tokenStartLine       int
+	tokenDepth           int
+	repeatSpecialChar    bool // only '}' can be repeated
+	nextTokenIsDirective bool
+	prev                 string
+	err                  error
+	options              *scannerOptions
+	ext                  Tokenizer
+}
+
+// NewScanner returns a new Scanner to read from r.
+func NewScanner(r io.Reader, options ...ScannerOption) *Scanner {
+	opts := &scannerOptions{}
+	for _, opt := range options {
+		opt.applyScannerOptions(opts)
+	}
+
+	s := &Scanner{
+		scanner:              bufio.NewScanner(r),
+		lineno:               1,
+		tokenStartLine:       1,
+		tokenDepth:           0,
+		repeatSpecialChar:    false,
+		nextTokenIsDirective: true,
+		options:              opts,
+	}
+
+	s.scanner.Split(bufio.ScanRunes)
+
+	return s
+}
+
+// Err returns the first non-EOF error that was encountered by the Scanner.
+func (s *Scanner) Err() error {
+	if s.err == io.EOF {
+		return nil
+	}
+	return s.err
+}
+
+func (s *Scanner) setErr(err error) {
+	if s.err == nil || s.err != io.EOF {
+		s.err = err
+	}
+}
+
+// Scan reads the next token from source and returns it. It returns io.EOF at the end of the source. Scanner errors are
+// returned when encountered.
+func (s *Scanner) Scan() (Token, error) { //nolint: funlen, gocognit, gocyclo, maintidx // sorry
+	if s.ext != nil {
+		t, err := s.ext.Next()
+		if err != nil {
+			if !errors.Is(err, ErrTokenizerDone) {
+				s.setErr(err)
+				return Token{}, s.err
+			}
+
+			s.ext = nil
+		} else {
+			return t, nil
+		}
+	}
+
+	var tok strings.Builder
+
+	lexState := skipSpace
+	newToken := false
+	readNext := true
+	esc := false
+
+	var r, quote string
+
+	for {
+		if s.err != nil {
+			return Token{}, s.err
+		}
+
+		switch {
+		case s.prev != "":
+			r, s.prev = s.prev, ""
+		case readNext:
+			if !s.scanner.Scan() {
+				if tok.Len() > 0 {
+					return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: lexState == inQuote}, nil
+				}
+
+				if s.tokenDepth > 0 {
+					s.setErr(&scannerError{line: s.tokenStartLine, msg: "unexpected end of file, expecting }"})
+					return Token{}, s.err
+				}
+
+				s.setErr(io.EOF)
+				return Token{}, s.err
+			}
+
+			nextRune := s.scanner.Text()
+			r = nextRune
+			if isEOL(r) {
+				s.lineno++
+				s.nextTokenIsDirective = true
+			}
+		default:
+			readNext = true
+		}
+
+		// skip CRs
+		if r == "\r" || r == "\\\r" {
+			continue
+		}
+
+		if r == "\\" && !esc {
+			esc = true
+			continue
+		}
+
+		if esc {
+			esc = false
+			r = "\\" + r
+		}
+
+		if tok.Len() > 0 {
+			t := tok.String()
+			if s.nextTokenIsDirective {
+				if ext, ok := s.options.extensions[t]; ok {
+					s.ext = ext.Tokenizer(&SubScanner{parent: s, tokenLine: s.tokenStartLine}, t)
+					return Token{Text: t, Line: s.tokenStartLine}, nil
+				}
+			}
+		}
+
+		switch lexState {
+		case skipSpace:
+			if !isSpace(r) {
+				lexState = inWord
+				newToken = true
+				readNext = false // re-eval
+				s.tokenStartLine = s.lineno
+			}
+			continue
+
+		case inWord:
+			if newToken {
+				newToken = false
+				if r == "#" {
+					tok.WriteString(r)
+					lexState = inComment
+					s.tokenStartLine = s.lineno
+					s.nextTokenIsDirective = false
+					continue
+				}
+			}
+
+			if isSpace(r) {
+				s.nextTokenIsDirective = false
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			// parameter expansion syntax (ex: "${var[@]}")
+			if tok.Len() > 0 && strings.HasSuffix(tok.String(), "$") && r == "{" {
+				tok.WriteString(r)
+				lexState = inVar
+				s.repeatSpecialChar = false
+				s.nextTokenIsDirective = false
+				continue
+			}
+
+			// add entire quoted string to the token buffer
+			if r == `"` || r == "'" {
+				if tok.Len() > 0 {
+					// if a quote is inside a token, treat it like any other char
+					tok.WriteString(r)
+				} else {
+					quote = r
+					lexState = inQuote
+					s.tokenStartLine = s.lineno
+				}
+				s.repeatSpecialChar = false
+				continue
+			}
+
+			// special characters treated as full tokens
+			if isSpecialChar(r) {
+				if tok.Len() > 0 {
+					s.prev = r
+					return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+				}
+
+				// only } can be repeated
+				if s.repeatSpecialChar && r != "}" {
+					s.setErr(newScannerErrf(s.tokenStartLine, "unexpected %q", r))
+					return Token{}, s.err
+				}
+
+				s.repeatSpecialChar = true
+				if r == "{" {
+					s.tokenDepth++
+				}
+
+				if r == "}" {
+					s.tokenDepth--
+					if s.tokenDepth < 0 {
+						s.setErr(&scannerError{line: s.tokenStartLine, msg: `unexpected "}"`})
+						return Token{}, s.err
+					}
+				}
+
+				tok.WriteString(r)
+				s.nextTokenIsDirective = true
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+
+			s.repeatSpecialChar = false
+			tok.WriteString(r)
+		case inComment:
+			if isEOL(r) {
+				return Token{Text: tok.String(), Line: s.tokenStartLine}, nil
+			}
+			tok.WriteString(r)
+		case inVar:
+			tok.WriteString(r)
+			if r != "}" && !isSpace(r) {
+				continue
+			}
+			lexState = inWord
+		case inQuote:
+			if r == quote {
+				return Token{Text: tok.String(), Line: s.tokenStartLine, IsQuoted: true}, nil
+			}
+			if r == "\\"+quote {
+				r = quote
+			}
+			tok.WriteString(r)
+		}
+	}
+}
+
+// ScannerExt is the interface that describes an extension for the [Scanner]. Scanner extensions enable scanning of
+// configurations that contain syntaxes that do not follow the usual grammar.
+type ScannerExt interface {
+	Tokenizer(s *SubScanner, matchedToken string) Tokenizer
+}
+
+// ErrTokenizerDone is returned by [Tokenizer] when tokenization is complete.
+var ErrTokenizerDone = errors.New("done")
+
+// Tokenizer is the interface that wraps the Next method.
+//
+// Next returns the next token scanned from the NGINX configuration or an error if the configuration cannot be
+// tokenized. Return the special error, [ErrTokenizerDone], when finished tokenizing.
+type Tokenizer interface {
+	Next() (Token, error)
+}
+
+// LexerScanner is a compatibility layer between Lexers and Scanner.
+type LexerScanner struct {
+	lexer        Lexer
+	scanner      *SubScanner
+	matchedToken string
+	ch           <-chan NgxToken
+}
+
+func (s *LexerScanner) Tokenizer(scanner *SubScanner, matchedToken string) Tokenizer {
+	s.scanner = scanner
+	s.matchedToken = matchedToken
+	return s
+}
+
+func (s *LexerScanner) Next() (Token, error) {
+	if s.ch == nil {
+		s.ch = s.lexer.Lex(s.scanner, s.matchedToken)
+	}
+
+	ngxTok, ok := <-s.ch
+	if !ok {
+		return Token{}, ErrTokenizerDone
+	}
+
+	if ngxTok.Error != nil {
+		return Token{}, newScannerErrf(ngxTok.Line, ngxTok.Error.Error())
+	}
+
+	return Token{Text: ngxTok.Value, Line: ngxTok.Line, IsQuoted: ngxTok.IsQuoted}, nil
+}
diff --git a/scanner_test.go b/scanner_test.go
new file mode 100644
index 00000000..7f2f5b1e
--- /dev/null
+++ b/scanner_test.go
@@ -0,0 +1,144 @@
+package crossplane
+
+import (
+	"errors"
+	"io"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/stretchr/testify/require"
+)
+
+func TestScanner(t *testing.T) {
+	t.Parallel()
+
+	for _, f := range lexFixtures {
+		f := f
+
+		t.Run(f.name, func(t *testing.T) {
+			t.Parallel()
+
+			path := getTestConfigPath(f.name, "nginx.conf")
+			file, err := os.Open(path)
+			if err != nil {
+				t.Fatal(err)
+			}
+			defer file.Close()
+
+			s := NewScanner(file, lua.RegisterLexer())
+
+			i := 0
+			for {
+				got, err := s.Scan()
+				if err == io.EOF {
+					if i < len(f.tokens)-1 {
+						t.Fatal("unexpected end of file")
+					}
+					return
+				}
+
+				if err != nil {
+					t.Fatalf("unexpected error: %v", err)
+				}
+
+				want := f.tokens[i]
+				require.Equal(t, want.value, got.Text, "got=%s", got)
+				require.Equal(t, want.line, got.Line, "got=%s", got)
+				i++
+			}
+		})
+	}
+}
+
+func TestScanner_unhappy(t *testing.T) {
+	t.Parallel()
+
+	for name, c := range unhappyFixtures {
+		c := c
+		t.Run(name, func(t *testing.T) {
+			t.Parallel()
+
+			s := NewScanner(strings.NewReader(c), lua.RegisterLexer())
+			for {
+				_, err := s.Scan()
+				if err == io.EOF {
+					t.Fatal("reached end of string")
+				}
+
+				if err != nil {
+					t.Logf("got error: %v", err)
+
+					if gotErr := s.Err(); !errors.Is(gotErr, err) {
+						t.Fatalf("errors do not match: have=%+v, want=%+v", gotErr, err)
+					}
+
+					if _, gotErr := s.Scan(); !errors.Is(gotErr, err) {
+						t.Fatalf("error after scan does not match: have=%+v, want=%+v", gotErr, err)
+					}
+
+					break
+				}
+			}
+		})
+	}
+}
+
+func benchmarkScanner(b *testing.B, path string, options ...ScannerOption) {
+	var t Token
+
+	file, err := os.Open(path)
+	if err != nil {
+		b.Fatal(err)
+	}
+	defer file.Close()
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		if _, err := file.Seek(0, 0); err != nil {
+			b.Fatal(err)
+		}
+
+		s := NewScanner(file, options...)
+
+		for {
+			tok, err := s.Scan()
+			if err == io.EOF {
+				break
+			}
+			if err != nil {
+				b.Fatal(err)
+			}
+			t = tok
+		}
+	}
+
+	_ = t
+}
+
+func BenchmarkScanner(b *testing.B) {
+	for _, bm := range lexFixtures {
+		if strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
+
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			benchmarkScanner(b, path)
+		})
+	}
+}
+
+func BenchmarkScannerWithLua(b *testing.B) {
+	for _, bm := range lexFixtures {
+		if !strings.HasPrefix(bm.name, "lua") {
+			continue
+		}
+
+		b.Run(bm.name, func(b *testing.B) {
+			path := getTestConfigPath(bm.name, "nginx.conf")
+			benchmarkScanner(b, path, lua.RegisterLexer())
+		})
+	}
+}
diff --git a/util.go b/util.go
index d2e84ade..186afeb5 100644
--- a/util.go
+++ b/util.go
@@ -35,6 +35,8 @@ func isEOL(s string) bool {
 	return strings.HasSuffix(s, "\n")
 }
 
+func isSpecialChar(s string) bool { return s == "{" || s == "}" || s == ";" }
+
 func repr(s string) string {
 	q := fmt.Sprintf("%q", s)
 	for _, char := range s {
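// Usage sketch (not part of the diff above): a minimal program that drives the
// new Scanner API introduced in scanner.go. The module import path and the
// sample config string are assumptions for illustration only.
package main

import (
	"errors"
	"fmt"
	"io"
	"log"
	"strings"

	crossplane "github.com/nginxinc/nginx-go-crossplane"
)

func main() {
	// An illustrative NGINX configuration; any io.Reader works.
	cfg := "events {}\nhttp { server { listen 80; } }\n"

	// Extensions such as lua.RegisterLexer() could be passed as additional
	// ScannerOption arguments, mirroring the tests in scanner_test.go.
	s := crossplane.NewScanner(strings.NewReader(cfg))

	for {
		tok, err := s.Scan()
		if errors.Is(err, io.EOF) {
			break // end of input
		}
		if err != nil {
			// LineNumber walks the error chain looking for a Line() int method.
			if line, ok := crossplane.LineNumber(err); ok {
				log.Fatalf("scan error on line %d: %v", line, err)
			}
			log.Fatal(err)
		}
		fmt.Println(tok) // Token.String prints {line, text, isQuoted}
	}
}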