From 93610ce779714f740c7ec03c225f6a3c1321c26c Mon Sep 17 00:00:00 2001 From: Colin E Date: Sun, 24 Jan 2021 20:44:03 +0000 Subject: [PATCH 1/7] Move character set parsing logic into the parser --- assembly/char.ts | 3 +- assembly/nfa/matcher.ts | 67 +++++++++++++++++++++++++++------------ assembly/parser/node.ts | 22 +++++++++++-- assembly/parser/parser.ts | 40 ++++++++++++++++------- assembly/regexp.ts | 36 ++++++++++++++++++++- ts/index.ts | 4 +-- 6 files changed, 133 insertions(+), 39 deletions(-) diff --git a/assembly/char.ts b/assembly/char.ts index 9505081..c6f6a65 100644 --- a/assembly/char.ts +++ b/assembly/char.ts @@ -9,8 +9,9 @@ export const enum Char { LeftParenthesis = 0x28, RightParenthesis = 0x29, Asterisk = 0x2a, // "*" - Comma = 0x2c, // "*" Plus = 0x2b, // "+" + Comma = 0x2c, // "*" + Minus = 0x2d, // "-" Dot = 0x2e, // "." Zero = 0x30, Question = 0x3f, // "?" diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts index c9be125..1c3b648 100644 --- a/assembly/nfa/matcher.ts +++ b/assembly/nfa/matcher.ts @@ -4,10 +4,14 @@ import { CharacterNode, CharacterSetNode, CharacterClassNode, + CharacterRangeNode, } from "../parser/node"; +import { Match } from "../regexp"; -export abstract class Matcher { - abstract matches(code: u32): bool; +export class Matcher { + matches(code: u32): bool { + return false; + } static fromCharacterClassNode( node: CharacterClassNode @@ -15,8 +19,27 @@ export abstract class Matcher { return new CharacterClassMatcher(node.charClass); } + static fromCharacterRangeNode( + node: CharacterRangeNode + ): CharacterRangeMatcher { + return new CharacterRangeMatcher(node.from, node.to); + } + static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher { - return new CharacterSetMatcher(node.chars, node.negated); + const matchers = new Array(); + for (let i = 0; i < node.expressions.length; i++) { + const exp = node.expressions[i]; + if (CharacterRangeNode.is(exp)) { + matchers.push( + Matcher.fromCharacterRangeNode(exp as CharacterRangeNode) + ); + } else if (CharacterNode.is(exp)) { + matchers.push(Matcher.fromCharacterNode(exp as CharacterNode)); + } else { + throw new Error("unsupported node type within character set"); + } + } + return new CharacterSetMatcher(matchers, node.negated); } static fromCharacterNode(node: CharacterNode): CharacterMatcher { @@ -34,6 +57,16 @@ export class CharacterMatcher extends Matcher { } } +export class CharacterRangeMatcher extends Matcher { + constructor(public from: u32, public to: u32) { + super(); + } + + matches(code: u32): bool { + return code >= this.from && code <= this.to; + } +} + export class CharacterClassMatcher extends Matcher { constructor(public charClass: Char) { super(); @@ -79,28 +112,20 @@ export class CharacterClassMatcher extends Matcher { } } +// no closure support +let _code: u32; + export class CharacterSetMatcher extends Matcher { - constructor(public set: string, public negated: bool) { + constructor(public matchers: Matcher[], public negated: bool) { super(); } - matchesSet(set: string, code: u32): bool { - for (let i = 0, len = set.length; i < len; i++) { - // TODO - perform the set parsing logic in the constructor? - // TODO - move into the parser? - if (i < len - 2 && set.charCodeAt(i + 1) == 45 /*-*/) { - const from = set.charCodeAt(i) as u32; - const to = set.charCodeAt(i + 2) as u32; - if (code >= from && code <= to) return true; - } else { - if (set.charCodeAt(i) == code) return true; - } - } - return false; - } - matches(code: u32): bool { - const matches = this.matchesSet(this.set, code); - return this.negated ? !matches : matches; + _code = code; + if (!this.negated) { + return this.matchers.some((m) => m.matches(_code)); + } else { + return !this.matchers.some((m) => m.matches(_code)); + } } } diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts index fce9d56..dbc1ec0 100644 --- a/assembly/parser/node.ts +++ b/assembly/parser/node.ts @@ -9,6 +9,7 @@ export const enum NodeType { Character, CharacterSet, CharacterClass, + CharacterRange, Repetition, RangeRepetition, Group, @@ -71,12 +72,29 @@ export class ConcatenationNode extends Node { } export class CharacterSetNode extends Node { - constructor(public chars: string, public negated: bool) { + constructor(public expressions: Node[], public negated: bool) { super(NodeType.CharacterSet); } clone(): Node { - return new CharacterSetNode(this.chars, this.negated); + return new CharacterSetNode( + this.expressions.slice(0).map((s) => s.clone()), + this.negated + ); + } +} + +export class CharacterRangeNode extends Node { + constructor(public from: u32, public to: u32) { + super(NodeType.CharacterRange); + } + + static is(node: Node): bool { + return node.type == NodeType.CharacterRange; + } + + clone(): Node { + return new CharacterRangeNode(this.from, this.to); } } diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index f0d5c43..43a9321 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -11,6 +11,7 @@ import { ConcatenationNode, RepetitionNode, CharacterSetNode, + CharacterRangeNode, } from "./node"; function isQuantifier(code: Char): bool { @@ -117,7 +118,7 @@ export class Parser { let firstDigit = true; let digitStr = ""; while (this.more()) { - let token = this.currentToken.charCodeAt(0); + const token = this.currentToken.charCodeAt(0); if (token == Char.RightParenthesis) break; if (firstDigit) { if (isDigit(token)) { @@ -170,7 +171,7 @@ export class Parser { private parseSequence(): Node { let nodes = new Array(); while (this.more()) { - let token = this.currentToken.charCodeAt(0); + const token = this.currentToken.charCodeAt(0); if (token == Char.RightParenthesis) break; // @ts-ignore if (token == Char.VerticalBar) { @@ -207,23 +208,38 @@ export class Parser { return nodes.length > 1 ? new ConcatenationNode(nodes) : nodes[0]; } + private parseCharacterRange(): Node { + const from = this.eatToken(); + this.eatToken(Char.Minus); + const to = this.eatToken(); + return new CharacterRangeNode(from, to); + } + private parseCharacterSet(): CharacterSetNode { - let chars = ""; this.eatToken(Char.LeftSquareBracket); - const negated = this.currentToken == "^"; + const token = this.currentToken.charCodeAt(0); + + const negated = token == Char.Caret; if (negated) { this.eatToken(Char.Caret); } - while ( - this.currentToken != "]" || - (chars.length == 0 && this.currentToken == "]") - ) { - // TODO characters set can contain character classes - chars += this.currentToken; - this.eatToken(); + + const nodes = new Array(); + while (this.currentToken != "]" || nodes.length == 0) { + // lookahead for character range + if ( + this.cursor + 1 < u32(this.input.length) && + this.input.charCodeAt(this.cursor + 1) == Char.Minus && + this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket + ) { + nodes.push(this.parseCharacterRange()); + } else { + nodes.push(this.parseCharacter()); + } + // TODO error if we run out of chars? } this.eatToken(Char.RightSquareBracket); - return new CharacterSetNode(chars, negated); + return new CharacterSetNode(nodes, negated); } } diff --git a/assembly/regexp.ts b/assembly/regexp.ts index 15ac55d..308fa75 100644 --- a/assembly/regexp.ts +++ b/assembly/regexp.ts @@ -1,10 +1,17 @@ -import { State, Automata, toNFAFromAST, GroupEndMarkerState } from "./nfa/nfa"; +import { + State, + Automata, + toNFAFromAST, + GroupEndMarkerState, + MatcherState, +} from "./nfa/nfa"; import { walker as nfaWalker } from "./nfa/walker"; import { ConcatenationNode, AssertionNode, NodeType } from "./parser/node"; import { Char } from "./char"; import { Parser } from "./parser/parser"; import { first, last } from "./util"; import { walker as astWalker, expandRepetitions } from "./parser/walker"; +import { CharacterMatcher, CharacterSetMatcher, Matcher } from "./nfa/matcher"; function recursiveBacktrackingSearch( state: State, @@ -145,5 +152,32 @@ export class RegExp { // TODO: do we need this factory function, or can we invoke // the ctr via the loader? export function createRegExp(regex: string, flags: string): RegExp { + + /* ---------------- */ + /* + This block of code is needed to avoid the following runtime error ... + + RuntimeError: unreachable + at assembly/nfa/matcher/Matcher#matches@virtual (wasm-function[240]:1) + at assembly/nfa/matcher/CharacterSetMatcher#matches~anonymous|0 (wasm-function[241]:19) + at ~lib/array/Array#some (wasm-function[242]:85) + at assembly/nfa/matcher/CharacterSetMatcher#matches (wasm-function[244]:21) + at assembly/nfa/nfa/MatcherState#matches (wasm-function[245]:8) + at assembly/nfa/nfa/State#matches@virtual (wasm-function[250]:58) + at assembly/regexp/recursiveBacktrackingSearch (wasm-function[184]:121) + at assembly/regexp/recursiveBacktrackingSearch@varargs (wasm-function[185]:56) + at assembly/regexp/RegExp#exec (wasm-function[192]:307) + */ + const matchers = new Array(); + matchers.push(new CharacterMatcher(Char.A)); + const charMatcher = new CharacterSetMatcher(matchers, false); + const state = new MatcherState( + charMatcher, + new State(true) + ); + const char = "a".charCodeAt(0); + const doesMatch = state.matches(char) != null; + /* ---------------- */ + return new RegExp(regex, flags); } diff --git a/ts/index.ts b/ts/index.ts index c14ab9d..fe02757 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,7 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("(a*)+"); -const match = regexObj.exec("-"); +const regexObj = new RegExp("[]a]"); +const match = regexObj.exec("]"); console.log(match); From 744a721ce8aad0c7da960e54e0c98cc1bde1f10a Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 07:50:14 +0000 Subject: [PATCH 2/7] small refactor --- assembly/nfa/matcher.ts | 12 ++++-------- assembly/regexp.ts | 1 - 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts index 1c3b648..9c32f0c 100644 --- a/assembly/nfa/matcher.ts +++ b/assembly/nfa/matcher.ts @@ -26,19 +26,15 @@ export class Matcher { } static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher { - const matchers = new Array(); - for (let i = 0; i < node.expressions.length; i++) { - const exp = node.expressions[i]; + const matchers = node.expressions.map((exp) => { if (CharacterRangeNode.is(exp)) { - matchers.push( - Matcher.fromCharacterRangeNode(exp as CharacterRangeNode) - ); + return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode); } else if (CharacterNode.is(exp)) { - matchers.push(Matcher.fromCharacterNode(exp as CharacterNode)); + return Matcher.fromCharacterNode(exp as CharacterNode); } else { throw new Error("unsupported node type within character set"); } - } + }); return new CharacterSetMatcher(matchers, node.negated); } diff --git a/assembly/regexp.ts b/assembly/regexp.ts index 308fa75..4c99f06 100644 --- a/assembly/regexp.ts +++ b/assembly/regexp.ts @@ -152,7 +152,6 @@ export class RegExp { // TODO: do we need this factory function, or can we invoke // the ctr via the loader? export function createRegExp(regex: string, flags: string): RegExp { - /* ---------------- */ /* This block of code is needed to avoid the following runtime error ... From f9bc2f4dced9255865603a8d63e04b1163675f3a Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 11:19:36 +0000 Subject: [PATCH 3/7] test code refactor --- {test => __spec_tests__}/data.test.js | 4 +- {test => __spec_tests__}/test.dat | 0 __tests__/alternations.js | 15 ++ __tests__/boundary-assertions.js | 19 ++ __tests__/capture-groups.js | 35 +++ __tests__/character-classes.js | 50 ++++ __tests__/character-sets.js | 41 ++++ __tests__/characters.js | 11 + __tests__/index.js | 71 ++++++ __tests__/quantifiers.js | 38 +++ __tests__/range-quantifiers.js | 27 +++ {test => __tests__}/util.js | 31 ++- package.json | 4 +- test/index.test.js | 337 -------------------------- 14 files changed, 340 insertions(+), 343 deletions(-) rename {test => __spec_tests__}/data.test.js (92%) rename {test => __spec_tests__}/test.dat (100%) create mode 100644 __tests__/alternations.js create mode 100644 __tests__/boundary-assertions.js create mode 100644 __tests__/capture-groups.js create mode 100644 __tests__/character-classes.js create mode 100644 __tests__/character-sets.js create mode 100644 __tests__/characters.js create mode 100644 __tests__/index.js create mode 100644 __tests__/quantifiers.js create mode 100644 __tests__/range-quantifiers.js rename {test => __tests__}/util.js (71%) delete mode 100644 test/index.test.js diff --git a/test/data.test.js b/__spec_tests__/data.test.js similarity index 92% rename from test/data.test.js rename to __spec_tests__/data.test.js index 0267f97..acb680e 100644 --- a/test/data.test.js +++ b/__spec_tests__/data.test.js @@ -1,8 +1,8 @@ -const { RegExp } = require("./util"); +const { RegExp } = require("../__tests__/util"); const fs = require("fs"); const { fail } = require("assert"); -const data = fs.readFileSync("./test/test.dat", "utf8"); +const data = fs.readFileSync("./__spec_tests__/test.dat", "utf8"); const lines = data.split("\n"); const matches = (regex, value) => { diff --git a/test/test.dat b/__spec_tests__/test.dat similarity index 100% rename from test/test.dat rename to __spec_tests__/test.dat diff --git a/__tests__/alternations.js b/__tests__/alternations.js new file mode 100644 index 0000000..fe1fd62 --- /dev/null +++ b/__tests__/alternations.js @@ -0,0 +1,15 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("or", () => { + expectMatch("a|b", ["b", "a"]); + expectNotMatch("a|b", ["c"]); + expectMatch("a|br", ["br", "a"]); + expectNotMatch("a|br", ["b", "c"]); +}); + +it("or multi-term", () => { + expectMatch("a|b|c", ["b", "a", "c"]); + expectNotMatch("a|b|c", ["d"]); + expectMatch("a|br|pc", ["br", "a", "pc"]); + expectNotMatch("a|br|pc", ["b", "pr"]); +}); diff --git a/__tests__/boundary-assertions.js b/__tests__/boundary-assertions.js new file mode 100644 index 0000000..ccff481 --- /dev/null +++ b/__tests__/boundary-assertions.js @@ -0,0 +1,19 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("matches end of string", () => { + const regex = new RegExp("a$"); + const match = regex.exec("ba"); + expect(match.index).toEqual(1); + expect(match.matches[0]).toEqual("a"); + expectNotMatch("a$", ["ab"]); +}); + +it("matches start of string", () => { + expectMatch("^a", ["a"]); + expectNotMatch("^a", ["ba"]); +}); + +it("handles escaped boundaries", () => { + expectMatch("\\^a", ["^a"]); + expectMatch("a\\$", ["a$"]); +}); diff --git a/__tests__/capture-groups.js b/__tests__/capture-groups.js new file mode 100644 index 0000000..91d57dd --- /dev/null +++ b/__tests__/capture-groups.js @@ -0,0 +1,35 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("supports capture groups", () => { + let match = matches("a(\\d)a", "a3a"); + expect(match.index).toEqual(0); + expect(match.input).toEqual("a3a"); + expect(match.matches[0]).toEqual("a3a"); + expect(match.matches[1]).toEqual("3"); + + match = matches("a(\\d)a", " a3a"); + expect(match.index).toEqual(2); + expect(match.input).toEqual(" a3a"); + expect(match.matches[0]).toEqual("a3a"); + expect(match.matches[1]).toEqual("3"); + + match = matches("a(\\d*)a", "a3456a"); + expect(match.index).toEqual(0); + expect(match.input).toEqual("a3456a"); + expect(match.matches[0]).toEqual("a3456a"); + expect(match.matches[1]).toEqual("3456"); + + match = matches("a*(\\d*)(a*)", "aaa456aaa"); + expect(match.index).toEqual(0); + expect(match.input).toEqual("aaa456aaa"); + expect(match.matches[0]).toEqual("aaa456aaa"); + expect(match.matches[1]).toEqual("456"); + expect(match.matches[2]).toEqual("aaa"); +}); + +it.skip("should not return captured values for non-matching alternations", () => { + const match = matches("(a|b)c|a(b|c)", "ab"); + expect(match.matches[0]).toEqual("ab"); + expect(match.matches[1]).toEqual(""); + expect(match.matches[2]).toEqual("b"); +}); diff --git a/__tests__/character-classes.js b/__tests__/character-classes.js new file mode 100644 index 0000000..21e88fe --- /dev/null +++ b/__tests__/character-classes.js @@ -0,0 +1,50 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("dot", () => { + expectMatch(".", [" ", "B", "|", "9"]); + expectNotMatch(".", ["", "\n"]); +}); + +it("digit", () => { + expectMatch("\\d", ["0", "9"]); + expectNotMatch("\\d", ["", "b"]); +}); + +it("non-digit", () => { + expectNotMatch("\\D", ["0", "9", ""]); + expectMatch("\\D", ["b", "|"]); +}); + +it("word", () => { + expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]); + expectNotMatch("\\w", ["", "$"]); +}); + +it("not word", () => { + expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]); + expectMatch("\\W", ["&", "$"]); +}); + +it("whitespace", () => { + expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]); + expectNotMatch("\\s", ["", "a", "0"]); +}); + +it("not whitespace", () => { + expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]); + expectMatch("\\S", ["a", "0"]); +}); + +it("tab, cr, lf, vt, ff", () => { + expectMatch("\\t", ["\t"]); + expectMatch("\\r", ["\r"]); + expectMatch("\\n", ["\n"]); + expectMatch("\\v", ["\v"]); + expectMatch("\\f", ["\f"]); + expectNotMatch("\\t", ["a", " ", ""]); +}); + +it("escaped dot", () => { + expectMatch("\\.", ["."]); + expectNotMatch("\\.", ["", "a"]); +}); diff --git a/__tests__/character-sets.js b/__tests__/character-sets.js new file mode 100644 index 0000000..74384be --- /dev/null +++ b/__tests__/character-sets.js @@ -0,0 +1,41 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("matches discrete characters", () => { + expectMatch("[abce]", ["a", "b", "c", "e"]); + expectNotMatch("[abce]", ["", "f", "h"]); +}); + +it("matches character ranges", () => { + expectMatch("[a-c]", ["a", "b", "c"]); + expectNotMatch("[a-c]", ["d", "e", ""]); + expectMatch("[K-M]", ["K", "L", "M"]); + expectNotMatch("[K-M]", ["9", "J"]); + expectMatch("[0-9]", ["0", "9"]); + expectNotMatch("[0-9]", ["a", "A"]); +}); + +it("matches multiple ranges", () => { + expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]); + expectNotMatch("[a-ce-f]", ["d"]); +}); + +it("supports closing brackets", () => { + expectMatch("[]a]", ["]", "a"]); +}); + +it("supports negated sets", () => { + expectNotMatch("[^a-c]", ["a", "b", "c"]); + expectMatch("[^a-c]", ["d", "e"]); + expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]); + expectMatch("[^a-ce-f]", ["d"]); +}); + +it("treats - as a literal", () => { + expectMatch("[-abc]", ["-", "a", "b", "c"]); + expectMatch("[abc-]", ["-", "a", "b", "c"]); +}); + +it("treats - as a literal in negated sets", () => { + expectNotMatch("[^-abc]", ["-", "a", "b", "c"]); + expectMatch("[^-abc]", ["1", "A"]); +}); diff --git a/__tests__/characters.js b/__tests__/characters.js new file mode 100644 index 0000000..28e37f5 --- /dev/null +++ b/__tests__/characters.js @@ -0,0 +1,11 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("single character", () => { + expectMatch("a", ["a"]); + expectNotMatch("a", ["fish", ""]); +}); + +it("concatenation", () => { + expectMatch("ab", ["ab"]); + expectNotMatch("ab", ["aac", "aa", ""]); +}); diff --git a/__tests__/index.js b/__tests__/index.js new file mode 100644 index 0000000..ea9a55c --- /dev/null +++ b/__tests__/index.js @@ -0,0 +1,71 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +describe("regexp", () => { + it("match returns correct substring", () => { + const match = matches("\\d", "asd123asd"); + expect(match.index).toEqual(3); + expect(match.input).toEqual("asd123asd"); + expect(match.matches[0]).toEqual("1"); + }); + + describe("global mode", () => { + it("increments lastIndex", () => { + const regex = new RegExp("\\d+", "g"); + const match = regex.exec("dog 23 fish 45 cat"); + expect(match.matches[0]).toEqual("23"); + expect(regex.lastIndex).toEqual(6); + }); + + it("uses lastIndex to support multiple matches", () => { + const regex = new RegExp("\\d+", "g"); + + let match = regex.exec("dog 23 fish 45 cat"); + expect(match.matches[0]).toEqual("23"); + expect(regex.lastIndex).toEqual(6); + + match = regex.exec("dog 23 fish 45 cat"); + expect(match.matches[0]).toEqual("45"); + expect(regex.lastIndex).toEqual(14); + + match = regex.exec("dog 23 fish 45 cat"); + expect(match).toBeNull(); + expect(regex.lastIndex).toEqual(0); + }); + }); + + describe("non-global mode", () => { + it("doesn't increment lastIndex", () => { + const regex = new RegExp("\\d+"); + + let match = regex.exec("dog 23 fish 45 cat"); + expect(match.matches[0]).toEqual("23"); + expect(regex.lastIndex).toEqual(0); + + match = regex.exec("dog 23 fish 45 cat"); + expect(match.matches[0]).toEqual("23"); + expect(regex.lastIndex).toEqual(0); + }); + }); +}); + +describe("use cases", () => { + it("matches combinations", () => { + expectMatch("\\s\\w*", [" bar"]); + expectMatch("\\S\\w*", ["foo"]); + }); + + it("email", () => { + const regex = ".+@.+\\..+"; + expect(matches(regex, "colin@gmail.com")).toBeTruthy(); + expect(matches(regex, "gmail")).toBeFalsy(); + + const capturingRegex = "(.+)@(.+)\\.(.+)"; + expect(matches(capturingRegex, "colin@gmail.com")).toBeTruthy(); + + match = matches(capturingRegex, "colin@gmail.com"); + expect(match.matches[0]).toEqual("colin@gmail.com"); + expect(match.matches[1]).toEqual("colin"); + expect(match.matches[2]).toEqual("gmail"); + expect(match.matches[3]).toEqual("com"); + }); +}); diff --git a/__tests__/quantifiers.js b/__tests__/quantifiers.js new file mode 100644 index 0000000..c5d5cf4 --- /dev/null +++ b/__tests__/quantifiers.js @@ -0,0 +1,38 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("matches empty strings", () => { + expectMatch("a?", [""]); + expectMatch("a*", [""]); +}); + +it("zero or one", () => { + expectMatch("a?", ["a"]); + // expectNotMatch("a?", ["bc"]); +}); + +it("one or more", () => { + expectMatch("a+", ["a", "aa"]); + expectNotMatch("a+", [""]); +}); + +it("zero or more", () => { + expectMatch("a*", ["aa", "aaaa"]); +}); + +it("multiple rules", () => { + expectMatch("a*b", ["b", "ab", "aaaab"]); + expectNotMatch("a*b", ["aaaad"]); +}); + +it("zero or more is greedy", () => { + let match = matches("a*", "aaaaa"); + expect(match).not.toBeNull(); + expect(match.matches[0]).toEqual("aaaaa"); +}); + +it("one or more is greedy", () => { + let match = matches("a+", "aaaaa"); + console.log(match); + expect(match).not.toBeNull(); + expect(match.matches[0]).toEqual("aaaaa"); +}); diff --git a/__tests__/range-quantifiers.js b/__tests__/range-quantifiers.js new file mode 100644 index 0000000..539122d --- /dev/null +++ b/__tests__/range-quantifiers.js @@ -0,0 +1,27 @@ +const { RegExp, expectNotMatch, expectMatch, matches } = require("./util"); + +it("handles single quantifier", () => { + expectMatch("a{2}", ["aa"]); + expectMatch("ba{2}", ["baa"]); + expectMatch("ba{1}b", ["bab"]); +}); + +it("handles open upper bound quantifiers", () => { + expectMatch("a{2,}", ["aa", "aaaaa"]); + expectMatch("ba{2,}", ["baa", "baaaaaaa"]); + expectMatch("ba{1,}b", ["bab", "baaaaaab"]); +}); + +it("handles explicit upper bound quantifiers", () => { + const match = matches("a{2,4}", "aaaaaaaaaa"); + expect(match.matches[0]).toEqual("aaaa"); +}); + +it("handles zero value quantifier", () => { + expectMatch("ba{0}b", ["bb"]); +}); + +it("handles quantifiers within alternates", () => { + expectMatch("a{2}|b{2}", ["bb", "aa"]); + expectNotMatch("a{2}|b{2}", ["cc"]); +}); diff --git a/test/util.js b/__tests__/util.js similarity index 71% rename from test/util.js rename to __tests__/util.js index bf6b154..a7a086a 100644 --- a/test/util.js +++ b/__tests__/util.js @@ -2,7 +2,7 @@ global.TextDecoder = require("text-encoding").TextDecoder; const fs = require("fs"); const loader = require("@assemblyscript/loader/umd/index"); -class RegExpProxy { +class RegExp { constructor(regex, flags = "") { this.wasmModule = loader.instantiateSync( fs.readFileSync("./build/untouched.wasm"), @@ -80,4 +80,31 @@ class RegExpProxy { } } -module.exports.RegExp = RegExpProxy; +const expectMatch = (regex, arr) => { + arr.forEach((value) => { + const regexp = new RegExp(regex); + const match = regexp.exec(value); + expect(match).not.toBeNull(); + expect(match.matches[0]).toEqual(value); + }); +}; + +const expectNotMatch = (regex, arr) => { + arr.forEach((value) => { + const regexp = new RegExp(regex); + const match = regexp.exec(value); + expect(match).toBeNull(); + }); +}; + +const matches = (regex, value) => { + const regexp = new RegExp(regex); + return regexp.exec(value); +}; + +test.todo("no tests in this file!"); + +module.exports.RegExp = RegExp; +module.exports.matches = matches; +module.exports.expectNotMatch = expectNotMatch; +module.exports.expectMatch = expectMatch; diff --git a/package.json b/package.json index 7e5a3e7..1ade2c1 100644 --- a/package.json +++ b/package.json @@ -4,8 +4,8 @@ "description": "A regex engine built with AssemblyScript", "main": "index.js", "scripts": { - "test": "npm run asbuild:untouched && npm run prettier:check && jest test/index.test.js", - "test:suite": "npm run asbuild:untouched && jest test/data.test.js --reporter=jest-summary-reporter", + "test": "npm run asbuild:untouched && npm run prettier:check && jest __tests__", + "test:suite": "npm run asbuild:untouched && jest __spec_tests__ --reporter=jest-summary-reporter", "prettier:check": "prettier --check .", "prettier:write": "prettier --write .", "asbuild:untouched": "asc assembly/index.ts --target debug", diff --git a/test/index.test.js b/test/index.test.js deleted file mode 100644 index 4f0ebcc..0000000 --- a/test/index.test.js +++ /dev/null @@ -1,337 +0,0 @@ -const { RegExp } = require("./util"); - -const expectMatch = (regex, arr) => { - arr.forEach((value) => { - const regexp = new RegExp(regex); - const match = regexp.exec(value); - expect(match).not.toBeNull(); - expect(match.matches[0]).toEqual(value); - }); -}; - -const expectNotMatch = (regex, arr) => { - arr.forEach((value) => { - const regexp = new RegExp(regex); - const match = regexp.exec(value); - expect(match).toBeNull(); - }); -}; - -const matches = (regex, value) => { - const regexp = new RegExp(regex); - return regexp.exec(value); -}; - -describe("Characters", () => { - it("single character", () => { - expectMatch("a", ["a"]); - expectNotMatch("a", ["fish", ""]); - }); - - it("concatenation", () => { - expectMatch("ab", ["ab"]); - expectNotMatch("ab", ["aac", "aa", ""]); - }); -}); - -describe("Quantifiers", () => { - it("matches empty strings", () => { - expectMatch("a?", [""]); - expectMatch("a*", [""]); - }); - - it("zero or one", () => { - expectMatch("a?", ["a"]); - // expectNotMatch("a?", ["bc"]); - }); - - it("one or more", () => { - expectMatch("a+", ["a", "aa"]); - expectNotMatch("a+", [""]); - }); - - it("zero or more", () => { - expectMatch("a*", ["aa", "aaaa"]); - }); - - it("multiple rules", () => { - expectMatch("a*b", ["b", "ab", "aaaab"]); - expectNotMatch("a*b", ["aaaad"]); - }); - - it("zero or more is greedy", () => { - let match = matches("a*", "aaaaa"); - expect(match).not.toBeNull(); - expect(match.matches[0]).toEqual("aaaaa"); - }); - - it("one or more is greedy", () => { - let match = matches("a+", "aaaaa"); - console.log(match); - expect(match).not.toBeNull(); - expect(match.matches[0]).toEqual("aaaaa"); - }); -}); - -describe("Groups and ranges", () => { - it("or", () => { - expectMatch("a|b", ["b", "a"]); - expectNotMatch("a|b", ["c"]); - expectMatch("a|br", ["br", "a"]); - expectNotMatch("a|br", ["b", "c"]); - }); - - it("or multi-term", () => { - expectMatch("a|b|c", ["b", "a", "c"]); - expectNotMatch("a|b|c", ["d"]); - expectMatch("a|br|pc", ["br", "a", "pc"]); - expectNotMatch("a|br|pc", ["b", "pr"]); - }); -}); - -describe("character sets", () => { - it("matches discrete characters", () => { - expectMatch("[abce]", ["a", "b", "c", "e"]); - expectNotMatch("[abce]", ["", "f", "h"]); - }); - - it("matches character ranges", () => { - expectMatch("[a-c]", ["a", "b", "c"]); - expectNotMatch("[a-c]", ["d", "e", ""]); - expectMatch("[K-M]", ["K", "L", "M"]); - expectNotMatch("[K-M]", ["9", "J"]); - expectMatch("[0-9]", ["0", "9"]); - expectNotMatch("[0-9]", ["a", "A"]); - }); - - it("matches multiple ranges", () => { - expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]); - expectNotMatch("[a-ce-f]", ["d"]); - }); - - it("supports closing brackets", () => { - expectMatch("[]a]", ["]", "a"]); - }); - - it("supports negated sets", () => { - expectNotMatch("[^a-c]", ["a", "b", "c"]); - expectMatch("[^a-c]", ["d", "e"]); - expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]); - expectMatch("[^a-ce-f]", ["d"]); - }); - - it("treats - as a literal", () => { - expectMatch("[-abc]", ["-", "a", "b", "c"]); - expectMatch("[abc-]", ["-", "a", "b", "c"]); - }); - - it("treats - as a literal in negated sets", () => { - expectNotMatch("[^-abc]", ["-", "a", "b", "c"]); - expectMatch("[^-abc]", ["1", "A"]); - }); -}); - -describe("character classes", () => { - it("dot", () => { - expectMatch(".", [" ", "B", "|", "9"]); - expectNotMatch(".", ["", "\n"]); - }); - - it("digit", () => { - expectMatch("\\d", ["0", "9"]); - expectNotMatch("\\d", ["", "b"]); - }); - - it("non-digit", () => { - expectNotMatch("\\D", ["0", "9", ""]); - expectMatch("\\D", ["b", "|"]); - }); - - it("word", () => { - expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]); - expectNotMatch("\\w", ["", "$"]); - }); - - it("not word", () => { - expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]); - expectMatch("\\W", ["&", "$"]); - }); - - it("whitespace", () => { - expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]); - expectNotMatch("\\s", ["", "a", "0"]); - }); - - it("not whitespace", () => { - expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]); - expectMatch("\\S", ["a", "0"]); - }); - - it("tab, cr, lf, vt, ff", () => { - expectMatch("\\t", ["\t"]); - expectMatch("\\r", ["\r"]); - expectMatch("\\n", ["\n"]); - expectMatch("\\v", ["\v"]); - expectMatch("\\f", ["\f"]); - expectNotMatch("\\t", ["a", " ", ""]); - }); - - it("escaped dot", () => { - expectMatch("\\.", ["."]); - expectNotMatch("\\.", ["", "a"]); - }); -}); - -describe("boundary assertions", () => { - it("matches end of string", () => { - const regex = new RegExp("a$"); - const match = regex.exec("ba"); - expect(match.index).toEqual(1); - expect(match.matches[0]).toEqual("a"); - expectNotMatch("a$", ["ab"]); - }); - - it("matches start of string", () => { - expectMatch("^a", ["a"]); - expectNotMatch("^a", ["ba"]); - }); - - it("handles escaped boundaries", () => { - expectMatch("\\^a", ["^a"]); - expectMatch("a\\$", ["a$"]); - }); -}); - -describe("regexp", () => { - it("match returns correct substring", () => { - const match = matches("\\d", "asd123asd"); - expect(match.index).toEqual(3); - expect(match.input).toEqual("asd123asd"); - expect(match.matches[0]).toEqual("1"); - }); - - describe("global mode", () => { - it("increments lastIndex", () => { - const regex = new RegExp("\\d+", "g"); - const match = regex.exec("dog 23 fish 45 cat"); - expect(match.matches[0]).toEqual("23"); - expect(regex.lastIndex).toEqual(6); - }); - - it("uses lastIndex to support multiple matches", () => { - const regex = new RegExp("\\d+", "g"); - - let match = regex.exec("dog 23 fish 45 cat"); - expect(match.matches[0]).toEqual("23"); - expect(regex.lastIndex).toEqual(6); - - match = regex.exec("dog 23 fish 45 cat"); - expect(match.matches[0]).toEqual("45"); - expect(regex.lastIndex).toEqual(14); - - match = regex.exec("dog 23 fish 45 cat"); - expect(match).toBeNull(); - expect(regex.lastIndex).toEqual(0); - }); - }); - - describe("non-global mode", () => { - it("doesn't increment lastIndex", () => { - const regex = new RegExp("\\d+"); - - let match = regex.exec("dog 23 fish 45 cat"); - expect(match.matches[0]).toEqual("23"); - expect(regex.lastIndex).toEqual(0); - - match = regex.exec("dog 23 fish 45 cat"); - expect(match.matches[0]).toEqual("23"); - expect(regex.lastIndex).toEqual(0); - }); - }); -}); - -describe("capture groups", () => { - it("supports capture groups", () => { - let match = matches("a(\\d)a", "a3a"); - expect(match.index).toEqual(0); - expect(match.input).toEqual("a3a"); - expect(match.matches[0]).toEqual("a3a"); - expect(match.matches[1]).toEqual("3"); - - match = matches("a(\\d)a", " a3a"); - expect(match.index).toEqual(2); - expect(match.input).toEqual(" a3a"); - expect(match.matches[0]).toEqual("a3a"); - expect(match.matches[1]).toEqual("3"); - - match = matches("a(\\d*)a", "a3456a"); - expect(match.index).toEqual(0); - expect(match.input).toEqual("a3456a"); - expect(match.matches[0]).toEqual("a3456a"); - expect(match.matches[1]).toEqual("3456"); - - match = matches("a*(\\d*)(a*)", "aaa456aaa"); - expect(match.index).toEqual(0); - expect(match.input).toEqual("aaa456aaa"); - expect(match.matches[0]).toEqual("aaa456aaa"); - expect(match.matches[1]).toEqual("456"); - expect(match.matches[2]).toEqual("aaa"); - }); - - it.skip("should not return captured values for non-matching alternations", () => { - const match = matches("(a|b)c|a(b|c)", "ab"); - expect(match.matches[0]).toEqual("ab"); - expect(match.matches[1]).toEqual(""); - expect(match.matches[2]).toEqual("b"); - }); -}); - -describe("range quantifiers", () => { - it("handles single quantifier", () => { - expectMatch("a{2}", ["aa"]); - expectMatch("ba{2}", ["baa"]); - expectMatch("ba{1}b", ["bab"]); - }); - - it("handles open upper bound quantifiers", () => { - expectMatch("a{2,}", ["aa", "aaaaa"]); - expectMatch("ba{2,}", ["baa", "baaaaaaa"]); - expectMatch("ba{1,}b", ["bab", "baaaaaab"]); - }); - - it("handles explicit upper bound quantifiers", () => { - const match = matches("a{2,4}", "aaaaaaaaaa"); - expect(match.matches[0]).toEqual("aaaa"); - }); - - it("handles zero value quantifier", () => { - expectMatch("ba{0}b", ["bb"]); - }); - - it("handles quantifiers within alternates", () => { - expectMatch("a{2}|b{2}", ["bb", "aa"]); - expectNotMatch("a{2}|b{2}", ["cc"]); - }); -}); - -describe("use cases", () => { - it("matches combinations", () => { - expectMatch("\\s\\w*", [" bar"]); - expectMatch("\\S\\w*", ["foo"]); - }); - - it("email", () => { - const regex = ".+@.+\\..+"; - expect(matches(regex, "colin@gmail.com")).toBeTruthy(); - expect(matches(regex, "gmail")).toBeFalsy(); - - const capturingRegex = "(.+)@(.+)\\.(.+)"; - expect(matches(capturingRegex, "colin@gmail.com")).toBeTruthy(); - - match = matches(capturingRegex, "colin@gmail.com"); - expect(match.matches[0]).toEqual("colin@gmail.com"); - expect(match.matches[1]).toEqual("colin"); - expect(match.matches[2]).toEqual("gmail"); - expect(match.matches[3]).toEqual("com"); - }); -}); From 356ed85ed83e2edfe7e4d190672c67c7bb1b9e6c Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 14:16:19 +0000 Subject: [PATCH 4/7] character sets support escaping of special chars --- __tests__/character-sets.js | 11 +++++++++++ __tests__/quantifiers.js | 1 - assembly/parser/parser.ts | 25 ++++++++++++++++++++++--- package.json | 1 + ts/index.ts | 4 ++-- 5 files changed, 36 insertions(+), 6 deletions(-) diff --git a/__tests__/character-sets.js b/__tests__/character-sets.js index 74384be..44b16a6 100644 --- a/__tests__/character-sets.js +++ b/__tests__/character-sets.js @@ -5,6 +5,17 @@ it("matches discrete characters", () => { expectNotMatch("[abce]", ["", "f", "h"]); }); +it("throws an error if no closing bracket is found", () => { + expect(() => new RegExp("[abce")).toThrow(); +}); + +it("supports escaping of special characters", () => { + expectMatch("[a\\^b]", ["a", "b", "^"]); + expectMatch("[a\\-c]", ["a", "c", "-"]); + expectMatch("[a\\]]", ["a", "]"]); + expectMatch("[a\\\\b]", ["a", "\\"]); +}); + it("matches character ranges", () => { expectMatch("[a-c]", ["a", "b", "c"]); expectNotMatch("[a-c]", ["d", "e", ""]); diff --git a/__tests__/quantifiers.js b/__tests__/quantifiers.js index c5d5cf4..8c43686 100644 --- a/__tests__/quantifiers.js +++ b/__tests__/quantifiers.js @@ -32,7 +32,6 @@ it("zero or more is greedy", () => { it("one or more is greedy", () => { let match = matches("a+", "aaaaa"); - console.log(match); expect(match).not.toBeNull(); expect(match.matches[0]).toEqual("aaaaa"); }); diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index 43a9321..3c848d6 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -18,6 +18,16 @@ function isQuantifier(code: Char): bool { return code == Char.Question || code == Char.Plus || code == Char.Asterisk; } +// characters which have special meaning within character sets +function isCharacterSetSpecialChar(code: Char): bool { + return ( + code == Char.Caret || + code == Char.Minus || + code == Char.RightSquareBracket || + code == Char.Backslash + ); +} + function isAssertion(code: u32): bool { return code == Char.Dollar || code == Char.Caret; // "$" or "^" } @@ -228,16 +238,25 @@ export class Parser { while (this.currentToken != "]" || nodes.length == 0) { // lookahead for character range if ( - this.cursor + 1 < u32(this.input.length) && + this.cursor + 2 < u32(this.input.length) && + this.currentToken != "\\" && this.input.charCodeAt(this.cursor + 1) == Char.Minus && this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket ) { nodes.push(this.parseCharacterRange()); } else { - nodes.push(this.parseCharacter()); + if ( + this.currentToken == "\\" && + isCharacterSetSpecialChar(this.input.charCodeAt(this.cursor + 1)) + ) { + this.eatToken(Char.Backslash); + } + nodes.push(new CharacterNode(this.eatToken())); } - // TODO error if we run out of chars? + if (this.cursor >= u32(this.input.length)) { + throw new SyntaxError("Unterminated character class"); + } } this.eatToken(Char.RightSquareBracket); return new CharacterSetNode(nodes, negated); diff --git a/package.json b/package.json index 1ade2c1..59183b9 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,7 @@ "scripts": { "test": "npm run asbuild:untouched && npm run prettier:check && jest __tests__", "test:suite": "npm run asbuild:untouched && jest __spec_tests__ --reporter=jest-summary-reporter", + "jest": "jest __tests__", "prettier:check": "prettier --check .", "prettier:write": "prettier --write .", "asbuild:untouched": "asc assembly/index.ts --target debug", diff --git a/ts/index.ts b/ts/index.ts index fe02757..2d42465 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,7 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("[]a]"); -const match = regexObj.exec("]"); +const regexObj = new RegExp("[a\\\\c]"); +const match = regexObj.exec("\\"); console.log(match); From 417f7834542d632e4249272c2de755b9d86ccaf4 Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 15:44:10 +0000 Subject: [PATCH 5/7] Added string iterator concept to parser --- assembly/parser/parser.ts | 110 +++++++++++++++++++++++++------------- ts/index.ts | 4 +- 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index 3c848d6..be8db87 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -58,44 +58,78 @@ class Range { to: i32 = -1; } -export class Parser { - currentToken: string = ""; +class StringIterator { + current: u32; cursor: u32 = 0; - private constructor(public input: string) {} + constructor(private sourceString: string) { + this.current = this.sourceString.charCodeAt(0); + } - static toAST(input: string): AST { - return new Parser(input).toAST(); + lookahead(distance: u32): u32 { + return this.sourceString.charCodeAt(this.cursor + distance); } - private eatToken(value: u32 = -1): u32 { - const token = this.currentToken.charCodeAt(0) as u32; - if (value != -1 && token != value) { - throw new Error("invalid token"); + next(): bool { + this.cursor++; + if (this.cursor >= u32(this.sourceString.length)) { + return false; } - this.currentToken = this.input.charAt(++this.cursor); - return token; + this.current = this.sourceString.charCodeAt(this.cursor); + return true; + } + + currentAsString(): string { + return String.fromCharCode(this.current); + } + + more(): bool { + return this.cursor < u32(this.sourceString.length); + } + + copy(): StringIterator { + const iterator = new StringIterator(this.sourceString); + iterator.cursor = this.cursor; + iterator.current = this.current; + return iterator; } +} + +export class Parser { + // currentToken: string = ""; + // cursor: u32 = 0; + iterator: StringIterator; - private more(): bool { - return this.currentToken.length > 0; + private constructor(input: string) { + this.iterator = new StringIterator(input); + } + + static toAST(input: string): AST { + return new Parser(input).toAST(); } - private resetCursor(): void { - this.cursor = 0; - this.currentToken = this.input.charAt(0); + private eatToken(value: u32 = -1): u32 { + const currentToken = this.iterator.current; + if (value != -1 && this.iterator.current != value) { + throw new Error("invalid token"); + } + this.iterator.next(); + return currentToken; } private toAST(): AST { - this.resetCursor(); return new AST(this.parseSequence()); } + private currentCharCode(): u32 { + return this.iterator.current; + } + private parseCharacter(): Node { - let token = this.currentToken.charCodeAt(0); + let token = this.iterator.current; if (token == Char.Backslash) { this.eatToken(Char.Backslash); - token = this.currentToken.charCodeAt(0); + token = this.iterator.current; if (isSpecialCharacter(token)) { this.eatToken(); return new CharacterNode(token); @@ -120,20 +154,20 @@ export class Parser { private maybeParseRepetitionRange(): Range { // snapshot - const previousCursor = this.cursor; + const iteratorCopy = this.iterator.copy(); this.eatToken(Char.LeftCurlyBrace); let range = new Range(); let firstDigit = true; let digitStr = ""; - while (this.more()) { - const token = this.currentToken.charCodeAt(0); + while (this.iterator.more()) { + const token = this.iterator.current; if (token == Char.RightParenthesis) break; if (firstDigit) { if (isDigit(token)) { // if it is a digit, keep eating - digitStr += this.currentToken; + digitStr += this.iterator.currentAsString(); } else { range.from = digitStr.length ? parseInt(digitStr) : -1; range.to = range.from; @@ -154,7 +188,7 @@ export class Parser { } else { if (isDigit(token)) { // if it is a digit, keep eating - digitStr += this.currentToken; + digitStr += this.iterator.currentAsString(); } else { range.to = digitStr.length ? parseInt(digitStr) : -1; if (token == Char.RightCurlyBrace) { @@ -171,8 +205,7 @@ export class Parser { } // repetition not found - reset state - this.cursor = previousCursor; - this.currentToken = this.input.charAt(previousCursor); + this.iterator = iteratorCopy; return range; } @@ -180,8 +213,8 @@ export class Parser { // parses a sequence of chars private parseSequence(): Node { let nodes = new Array(); - while (this.more()) { - const token = this.currentToken.charCodeAt(0); + while (this.iterator.more()) { + const token = this.iterator.current; if (token == Char.RightParenthesis) break; // @ts-ignore if (token == Char.VerticalBar) { @@ -227,34 +260,35 @@ export class Parser { private parseCharacterSet(): CharacterSetNode { this.eatToken(Char.LeftSquareBracket); - const token = this.currentToken.charCodeAt(0); - const negated = token == Char.Caret; + const negated = this.iterator.current == Char.Caret; if (negated) { this.eatToken(Char.Caret); } const nodes = new Array(); - while (this.currentToken != "]" || nodes.length == 0) { + while ( + this.iterator.current != Char.RightSquareBracket || + nodes.length == 0 + ) { // lookahead for character range if ( - this.cursor + 2 < u32(this.input.length) && - this.currentToken != "\\" && - this.input.charCodeAt(this.cursor + 1) == Char.Minus && - this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket + this.iterator.current != Char.Backslash && + this.iterator.lookahead(1) == Char.Minus && + this.iterator.lookahead(2) != Char.RightSquareBracket ) { nodes.push(this.parseCharacterRange()); } else { if ( - this.currentToken == "\\" && - isCharacterSetSpecialChar(this.input.charCodeAt(this.cursor + 1)) + this.iterator.current == Char.Backslash && + isCharacterSetSpecialChar(this.iterator.lookahead(1)) ) { this.eatToken(Char.Backslash); } nodes.push(new CharacterNode(this.eatToken())); } - if (this.cursor >= u32(this.input.length)) { + if (!this.iterator.more()) { throw new SyntaxError("Unterminated character class"); } } diff --git a/ts/index.ts b/ts/index.ts index 2d42465..f2ecd61 100644 --- a/ts/index.ts +++ b/ts/index.ts @@ -5,7 +5,7 @@ globalAny.log = console.log; import { RegExp } from "../assembly/regexp"; -const regexObj = new RegExp("[a\\\\c]"); -const match = regexObj.exec("\\"); +const regexObj = new RegExp("[abce]"); +const match = regexObj.exec("a"); console.log(match); From bc8bc1fb1bcf308a06c0b011d13f75b016a60b96 Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 15:57:57 +0000 Subject: [PATCH 6/7] string iterator into new file --- assembly/parser/parser.ts | 40 +----------------------------- assembly/parser/string-iterator.ts | 36 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 39 deletions(-) create mode 100644 assembly/parser/string-iterator.ts diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts index be8db87..a9d1017 100644 --- a/assembly/parser/parser.ts +++ b/assembly/parser/parser.ts @@ -1,4 +1,5 @@ import { isDigit, Char } from "../char"; +import { StringIterator } from "./string-iterator"; import { AST, RangeRepetitionNode, @@ -58,46 +59,7 @@ class Range { to: i32 = -1; } -class StringIterator { - current: u32; - cursor: u32 = 0; - - constructor(private sourceString: string) { - this.current = this.sourceString.charCodeAt(0); - } - - lookahead(distance: u32): u32 { - return this.sourceString.charCodeAt(this.cursor + distance); - } - - next(): bool { - this.cursor++; - if (this.cursor >= u32(this.sourceString.length)) { - return false; - } - this.current = this.sourceString.charCodeAt(this.cursor); - return true; - } - - currentAsString(): string { - return String.fromCharCode(this.current); - } - - more(): bool { - return this.cursor < u32(this.sourceString.length); - } - - copy(): StringIterator { - const iterator = new StringIterator(this.sourceString); - iterator.cursor = this.cursor; - iterator.current = this.current; - return iterator; - } -} - export class Parser { - // currentToken: string = ""; - // cursor: u32 = 0; iterator: StringIterator; private constructor(input: string) { diff --git a/assembly/parser/string-iterator.ts b/assembly/parser/string-iterator.ts new file mode 100644 index 0000000..fbd49b7 --- /dev/null +++ b/assembly/parser/string-iterator.ts @@ -0,0 +1,36 @@ +export class StringIterator { + current: u32; + cursor: u32 = 0; + + constructor(private sourceString: string) { + this.current = this.sourceString.charCodeAt(0); + } + + lookahead(distance: u32): u32 { + return this.sourceString.charCodeAt(this.cursor + distance); + } + + next(): bool { + this.cursor++; + if (this.cursor >= u32(this.sourceString.length)) { + return false; + } + this.current = this.sourceString.charCodeAt(this.cursor); + return true; + } + + currentAsString(): string { + return String.fromCharCode(this.current); + } + + more(): bool { + return this.cursor < u32(this.sourceString.length); + } + + copy(): StringIterator { + const iterator = new StringIterator(this.sourceString); + iterator.cursor = this.cursor; + iterator.current = this.current; + return iterator; + } +} From 267b948c61d3c6e599fd058903b56d8f7f1dc9f5 Mon Sep 17 00:00:00 2001 From: Colin E Date: Mon, 25 Jan 2021 16:03:36 +0000 Subject: [PATCH 7/7] updated to use NodeType --- assembly/nfa/matcher.ts | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts index 9c32f0c..93a4067 100644 --- a/assembly/nfa/matcher.ts +++ b/assembly/nfa/matcher.ts @@ -5,6 +5,8 @@ import { CharacterSetNode, CharacterClassNode, CharacterRangeNode, + NodeType, + Node, } from "../parser/node"; import { Match } from "../regexp"; @@ -27,12 +29,13 @@ export class Matcher { static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher { const matchers = node.expressions.map((exp) => { - if (CharacterRangeNode.is(exp)) { - return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode); - } else if (CharacterNode.is(exp)) { - return Matcher.fromCharacterNode(exp as CharacterNode); - } else { - throw new Error("unsupported node type within character set"); + switch (exp.type) { + case NodeType.CharacterRange: + return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode); + case NodeType.Character: + return Matcher.fromCharacterNode(exp as CharacterNode); + default: + throw new Error("unsupported node type within character set"); } }); return new CharacterSetMatcher(matchers, node.negated);