From 93610ce779714f740c7ec03c225f6a3c1321c26c Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Sun, 24 Jan 2021 20:44:03 +0000
Subject: [PATCH 1/7] Move character set parsing logic into the parser

---
 assembly/char.ts          |  3 +-
 assembly/nfa/matcher.ts   | 67 +++++++++++++++++++++++++++------------
 assembly/parser/node.ts   | 22 +++++++++++--
 assembly/parser/parser.ts | 40 ++++++++++++++++-------
 assembly/regexp.ts        | 36 ++++++++++++++++++++-
 ts/index.ts               |  4 +--
 6 files changed, 133 insertions(+), 39 deletions(-)
diff --git a/assembly/char.ts b/assembly/char.ts
index 9505081..c6f6a65 100644
--- a/assembly/char.ts
+++ b/assembly/char.ts
@@ -9,8 +9,9 @@ export const enum Char {
   LeftParenthesis = 0x28,
   RightParenthesis = 0x29,
   Asterisk = 0x2a, // "*"
-  Comma = 0x2c, // "*"
   Plus = 0x2b, // "+"
+  Comma = 0x2c, // ","
+  Minus = 0x2d, // "-"
   Dot = 0x2e, // "."
   Zero = 0x30,
   Question = 0x3f, // "?"
diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts
index c9be125..1c3b648 100644
--- a/assembly/nfa/matcher.ts
+++ b/assembly/nfa/matcher.ts
@@ -4,10 +4,14 @@ import {
   CharacterNode,
   CharacterSetNode,
   CharacterClassNode,
+  CharacterRangeNode,
 } from "../parser/node";
+import { Match } from "../regexp";
 
-export abstract class Matcher {
-  abstract matches(code: u32): bool;
+export class Matcher {
+  matches(code: u32): bool {
+    return false;
+  }
 
   static fromCharacterClassNode(
     node: CharacterClassNode
@@ -15,8 +19,27 @@ export abstract class Matcher {
     return new CharacterClassMatcher(node.charClass);
   }
 
+  static fromCharacterRangeNode(
+    node: CharacterRangeNode
+  ): CharacterRangeMatcher {
+    return new CharacterRangeMatcher(node.from, node.to);
+  }
+
   static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher {
-    return new CharacterSetMatcher(node.chars, node.negated);
+    const matchers = new Array<Matcher>();
+    for (let i = 0; i < node.expressions.length; i++) {
+      const exp = node.expressions[i];
+      if (CharacterRangeNode.is(exp)) {
+        matchers.push(
+          Matcher.fromCharacterRangeNode(exp as CharacterRangeNode)
+        );
+      } else if (CharacterNode.is(exp)) {
+        matchers.push(Matcher.fromCharacterNode(exp as CharacterNode));
+      } else {
+        throw new Error("unsupported node type within character set");
+      }
+    }
+    return new CharacterSetMatcher(matchers, node.negated);
   }
 
   static fromCharacterNode(node: CharacterNode): CharacterMatcher {
@@ -34,6 +57,16 @@ export class CharacterMatcher extends Matcher {
   }
 }
 
+export class CharacterRangeMatcher extends Matcher {
+  constructor(public from: u32, public to: u32) {
+    super();
+  }
+
+  matches(code: u32): bool {
+    return code >= this.from && code <= this.to;
+  }
+}
+
 export class CharacterClassMatcher extends Matcher {
   constructor(public charClass: Char) {
     super();
@@ -79,28 +112,20 @@ export class CharacterClassMatcher extends Matcher {
   }
 }
 
+// no closure support
+let _code: u32;
+
 export class CharacterSetMatcher extends Matcher {
-  constructor(public set: string, public negated: bool) {
+  constructor(public matchers: Matcher[], public negated: bool) {
     super();
   }
 
-  matchesSet(set: string, code: u32): bool {
-    for (let i = 0, len = set.length; i < len; i++) {
-      // TODO - perform the set parsing logic in the constructor?
-      // TODO - move into the parser?
-      if (i < len - 2 && set.charCodeAt(i + 1) == 45 /*-*/) {
-        const from = set.charCodeAt(i) as u32;
-        const to = set.charCodeAt(i + 2) as u32;
-        if (code >= from && code <= to) return true;
-      } else {
-        if (set.charCodeAt(i) == code) return true;
-      }
-    }
-    return false;
-  }
-
   matches(code: u32): bool {
-    const matches = this.matchesSet(this.set, code);
-    return this.negated ? !matches : matches;
+    _code = code;
+    if (!this.negated) {
+      return this.matchers.some((m) => m.matches(_code));
+    } else {
+      return !this.matchers.some((m) => m.matches(_code));
+    }
   }
 }
diff --git a/assembly/parser/node.ts b/assembly/parser/node.ts
index fce9d56..dbc1ec0 100644
--- a/assembly/parser/node.ts
+++ b/assembly/parser/node.ts
@@ -9,6 +9,7 @@ export const enum NodeType {
   Character,
   CharacterSet,
   CharacterClass,
+  CharacterRange,
   Repetition,
   RangeRepetition,
   Group,
@@ -71,12 +72,29 @@ export class ConcatenationNode extends Node {
 }
 
 export class CharacterSetNode extends Node {
-  constructor(public chars: string, public negated: bool) {
+  constructor(public expressions: Node[], public negated: bool) {
     super(NodeType.CharacterSet);
   }
 
   clone(): Node {
-    return new CharacterSetNode(this.chars, this.negated);
+    return new CharacterSetNode(
+      this.expressions.slice(0).map<Node>((s) => s.clone()),
+      this.negated
+    );
+  }
+}
+
+export class CharacterRangeNode extends Node {
+  constructor(public from: u32, public to: u32) {
+    super(NodeType.CharacterRange);
+  }
+
+  static is(node: Node): bool {
+    return node.type == NodeType.CharacterRange;
+  }
+
+  clone(): Node {
+    return new CharacterRangeNode(this.from, this.to);
   }
 }
 
diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts
index f0d5c43..43a9321 100644
--- a/assembly/parser/parser.ts
+++ b/assembly/parser/parser.ts
@@ -11,6 +11,7 @@ import {
   ConcatenationNode,
   RepetitionNode,
   CharacterSetNode,
+  CharacterRangeNode,
 } from "./node";
 
 function isQuantifier(code: Char): bool {
@@ -117,7 +118,7 @@ export class Parser {
     let firstDigit = true;
     let digitStr = "";
     while (this.more()) {
-      let token = this.currentToken.charCodeAt(0);
+      const token = this.currentToken.charCodeAt(0);
       if (token == Char.RightParenthesis) break;
       if (firstDigit) {
         if (isDigit(token)) {
@@ -170,7 +171,7 @@ export class Parser {
   private parseSequence(): Node {
     let nodes = new Array<Node>();
     while (this.more()) {
-      let token = this.currentToken.charCodeAt(0);
+      const token = this.currentToken.charCodeAt(0);
       if (token == Char.RightParenthesis) break;
       // @ts-ignore
       if (token == Char.VerticalBar) {
@@ -207,23 +208,38 @@ export class Parser {
     return nodes.length > 1 ? new ConcatenationNode(nodes) : nodes[0];
   }
 
+  private parseCharacterRange(): Node {
+    const from = this.eatToken();
+    this.eatToken(Char.Minus);
+    const to = this.eatToken();
+    return new CharacterRangeNode(from, to);
+  }
+
   private parseCharacterSet(): CharacterSetNode {
-    let chars = "";
     this.eatToken(Char.LeftSquareBracket);
-    const negated = this.currentToken == "^";
+    const token = this.currentToken.charCodeAt(0);
+
+    const negated = token == Char.Caret;
     if (negated) {
       this.eatToken(Char.Caret);
     }
-    while (
-      this.currentToken != "]" ||
-      (chars.length == 0 && this.currentToken == "]")
-    ) {
-      // TODO characters set can contain character classes
-      chars += this.currentToken;
-      this.eatToken();
+
+    const nodes = new Array<Node>();
+    while (this.currentToken != "]" || nodes.length == 0) {
+      // lookahead for character range
+      if (
+        this.cursor + 1 < u32(this.input.length) &&
+        this.input.charCodeAt(this.cursor + 1) == Char.Minus &&
+        this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket
+      ) {
+        nodes.push(this.parseCharacterRange());
+      } else {
+        nodes.push(this.parseCharacter());
+      }
+
       // TODO error if we run out of chars?
     }
     this.eatToken(Char.RightSquareBracket);
-    return new CharacterSetNode(chars, negated);
+    return new CharacterSetNode(nodes, negated);
   }
 }
diff --git a/assembly/regexp.ts b/assembly/regexp.ts
index 15ac55d..308fa75 100644
--- a/assembly/regexp.ts
+++ b/assembly/regexp.ts
@@ -1,10 +1,17 @@
-import { State, Automata, toNFAFromAST, GroupEndMarkerState } from "./nfa/nfa";
+import {
+  State,
+  Automata,
+  toNFAFromAST,
+  GroupEndMarkerState,
+  MatcherState,
+} from "./nfa/nfa";
 import { walker as nfaWalker } from "./nfa/walker";
 import { ConcatenationNode, AssertionNode, NodeType } from "./parser/node";
 import { Char } from "./char";
 import { Parser } from "./parser/parser";
 import { first, last } from "./util";
 import { walker as astWalker, expandRepetitions } from "./parser/walker";
+import { CharacterMatcher, CharacterSetMatcher, Matcher } from "./nfa/matcher";
 
 function recursiveBacktrackingSearch(
   state: State,
@@ -145,5 +152,32 @@ export class RegExp {
 // TODO: do we need this factory function, or can we invoke
 // the ctr via the loader?
 export function createRegExp(regex: string, flags: string): RegExp {
+
+  /* ---------------- */
+  /*
+    This block of code is needed to avoid the following runtime error ...
+
+    RuntimeError: unreachable
+        at assembly/nfa/matcher/Matcher#matches@virtual (wasm-function[240]:1)
+        at assembly/nfa/matcher/CharacterSetMatcher#matches~anonymous|0 (wasm-function[241]:19)
+        at ~lib/array/Array<assembly/nfa/matcher/Matcher>#some (wasm-function[242]:85)
+        at assembly/nfa/matcher/CharacterSetMatcher#matches (wasm-function[244]:21)
+        at assembly/nfa/nfa/MatcherState<assembly/nfa/matcher/CharacterSetMatcher>#matches (wasm-function[245]:8)
+        at assembly/nfa/nfa/State#matches@virtual (wasm-function[250]:58)
+        at assembly/regexp/recursiveBacktrackingSearch (wasm-function[184]:121)
+        at assembly/regexp/recursiveBacktrackingSearch@varargs (wasm-function[185]:56)
+        at assembly/regexp/RegExp#exec (wasm-function[192]:307)
+  */
+  const matchers = new Array<Matcher>();
+  matchers.push(new CharacterMatcher(Char.A));
+  const charMatcher = new CharacterSetMatcher(matchers, false);
+  const state = new MatcherState<CharacterSetMatcher>(
+    charMatcher,
+    new State(true)
+  );
+  const char = "a".charCodeAt(0);
+  const doesMatch = state.matches(char) != null;
+  /* ---------------- */
+
   return new RegExp(regex, flags);
 }
diff --git a/ts/index.ts b/ts/index.ts
index c14ab9d..fe02757 100644
--- a/ts/index.ts
+++ b/ts/index.ts
@@ -5,7 +5,7 @@ globalAny.log = console.log;
 
 import { RegExp } from "../assembly/regexp";
 
-const regexObj = new RegExp("(a*)+");
-const match = regexObj.exec("-");
+const regexObj = new RegExp("[]a]");
+const match = regexObj.exec("]");
 
 console.log(match);

From 744a721ce8aad0c7da960e54e0c98cc1bde1f10a Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 07:50:14 +0000
Subject: [PATCH 2/7] Small refactor: build character set matchers with Array#map

---
 assembly/nfa/matcher.ts | 12 ++++--------
 assembly/regexp.ts      |  1 -
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts
index 1c3b648..9c32f0c 100644
--- a/assembly/nfa/matcher.ts
+++ b/assembly/nfa/matcher.ts
@@ -26,19 +26,15 @@ export class Matcher {
   }
 
   static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher {
-    const matchers = new Array<Matcher>();
-    for (let i = 0; i < node.expressions.length; i++) {
-      const exp = node.expressions[i];
+    const matchers = node.expressions.map<Matcher>((exp) => {
       if (CharacterRangeNode.is(exp)) {
-        matchers.push(
-          Matcher.fromCharacterRangeNode(exp as CharacterRangeNode)
-        );
+        return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode);
       } else if (CharacterNode.is(exp)) {
-        matchers.push(Matcher.fromCharacterNode(exp as CharacterNode));
+        return Matcher.fromCharacterNode(exp as CharacterNode);
       } else {
         throw new Error("unsupported node type within character set");
       }
-    }
+    });
     return new CharacterSetMatcher(matchers, node.negated);
   }
 
diff --git a/assembly/regexp.ts b/assembly/regexp.ts
index 308fa75..4c99f06 100644
--- a/assembly/regexp.ts
+++ b/assembly/regexp.ts
@@ -152,7 +152,6 @@ export class RegExp {
 // TODO: do we need this factory function, or can we invoke
 // the ctr via the loader?
 export function createRegExp(regex: string, flags: string): RegExp {
-
   /* ---------------- */
   /*
     This block of code is needed to avoid the following runtime error ...

From f9bc2f4dced9255865603a8d63e04b1163675f3a Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 11:19:36 +0000
Subject: [PATCH 3/7] Test code refactor: split tests into per-feature files

---
 {test => __spec_tests__}/data.test.js |   4 +-
 {test => __spec_tests__}/test.dat     |   0
 __tests__/alternations.js             |  15 ++
 __tests__/boundary-assertions.js      |  19 ++
 __tests__/capture-groups.js           |  35 +++
 __tests__/character-classes.js        |  50 ++++
 __tests__/character-sets.js           |  41 ++++
 __tests__/characters.js               |  11 +
 __tests__/index.js                    |  71 ++++++
 __tests__/quantifiers.js              |  38 +++
 __tests__/range-quantifiers.js        |  27 +++
 {test => __tests__}/util.js           |  31 ++-
 package.json                          |   4 +-
 test/index.test.js                    | 337 --------------------------
 14 files changed, 340 insertions(+), 343 deletions(-)
 rename {test => __spec_tests__}/data.test.js (92%)
 rename {test => __spec_tests__}/test.dat (100%)
 create mode 100644 __tests__/alternations.js
 create mode 100644 __tests__/boundary-assertions.js
 create mode 100644 __tests__/capture-groups.js
 create mode 100644 __tests__/character-classes.js
 create mode 100644 __tests__/character-sets.js
 create mode 100644 __tests__/characters.js
 create mode 100644 __tests__/index.js
 create mode 100644 __tests__/quantifiers.js
 create mode 100644 __tests__/range-quantifiers.js
 rename {test => __tests__}/util.js (71%)
 delete mode 100644 test/index.test.js

diff --git a/test/data.test.js b/__spec_tests__/data.test.js
similarity index 92%
rename from test/data.test.js
rename to __spec_tests__/data.test.js
index 0267f97..acb680e 100644
--- a/test/data.test.js
+++ b/__spec_tests__/data.test.js
@@ -1,8 +1,8 @@
-const { RegExp } = require("./util");
+const { RegExp } = require("../__tests__/util");
 const fs = require("fs");
 const { fail } = require("assert");
 
-const data = fs.readFileSync("./test/test.dat", "utf8");
+const data = fs.readFileSync("./__spec_tests__/test.dat", "utf8");
 const lines = data.split("\n");
 
 const matches = (regex, value) => {
diff --git a/test/test.dat b/__spec_tests__/test.dat
similarity index 100%
rename from test/test.dat
rename to __spec_tests__/test.dat
diff --git a/__tests__/alternations.js b/__tests__/alternations.js
new file mode 100644
index 0000000..fe1fd62
--- /dev/null
+++ b/__tests__/alternations.js
@@ -0,0 +1,15 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("or", () => {
+  expectMatch("a|b", ["b", "a"]);
+  expectNotMatch("a|b", ["c"]);
+  expectMatch("a|br", ["br", "a"]);
+  expectNotMatch("a|br", ["b", "c"]);
+});
+
+it("or multi-term", () => {
+  expectMatch("a|b|c", ["b", "a", "c"]);
+  expectNotMatch("a|b|c", ["d"]);
+  expectMatch("a|br|pc", ["br", "a", "pc"]);
+  expectNotMatch("a|br|pc", ["b", "pr"]);
+});
diff --git a/__tests__/boundary-assertions.js b/__tests__/boundary-assertions.js
new file mode 100644
index 0000000..ccff481
--- /dev/null
+++ b/__tests__/boundary-assertions.js
@@ -0,0 +1,19 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("matches end of string", () => {
+  const regex = new RegExp("a$");
+  const match = regex.exec("ba");
+  expect(match.index).toEqual(1);
+  expect(match.matches[0]).toEqual("a");
+  expectNotMatch("a$", ["ab"]);
+});
+
+it("matches start of string", () => {
+  expectMatch("^a", ["a"]);
+  expectNotMatch("^a", ["ba"]);
+});
+
+it("handles escaped boundaries", () => {
+  expectMatch("\\^a", ["^a"]);
+  expectMatch("a\\$", ["a$"]);
+});
diff --git a/__tests__/capture-groups.js b/__tests__/capture-groups.js
new file mode 100644
index 0000000..91d57dd
--- /dev/null
+++ b/__tests__/capture-groups.js
@@ -0,0 +1,35 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("supports capture groups", () => {
+  let match = matches("a(\\d)a", "a3a");
+  expect(match.index).toEqual(0);
+  expect(match.input).toEqual("a3a");
+  expect(match.matches[0]).toEqual("a3a");
+  expect(match.matches[1]).toEqual("3");
+
+  match = matches("a(\\d)a", "  a3a");
+  expect(match.index).toEqual(2);
+  expect(match.input).toEqual("  a3a");
+  expect(match.matches[0]).toEqual("a3a");
+  expect(match.matches[1]).toEqual("3");
+
+  match = matches("a(\\d*)a", "a3456a");
+  expect(match.index).toEqual(0);
+  expect(match.input).toEqual("a3456a");
+  expect(match.matches[0]).toEqual("a3456a");
+  expect(match.matches[1]).toEqual("3456");
+
+  match = matches("a*(\\d*)(a*)", "aaa456aaa");
+  expect(match.index).toEqual(0);
+  expect(match.input).toEqual("aaa456aaa");
+  expect(match.matches[0]).toEqual("aaa456aaa");
+  expect(match.matches[1]).toEqual("456");
+  expect(match.matches[2]).toEqual("aaa");
+});
+
+it.skip("should not return captured values for non-matching alternations", () => {
+  const match = matches("(a|b)c|a(b|c)", "ab");
+  expect(match.matches[0]).toEqual("ab");
+  expect(match.matches[1]).toEqual("");
+  expect(match.matches[2]).toEqual("b");
+});
diff --git a/__tests__/character-classes.js b/__tests__/character-classes.js
new file mode 100644
index 0000000..21e88fe
--- /dev/null
+++ b/__tests__/character-classes.js
@@ -0,0 +1,50 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("dot", () => {
+  expectMatch(".", [" ", "B", "|", "9"]);
+  expectNotMatch(".", ["", "\n"]);
+});
+
+it("digit", () => {
+  expectMatch("\\d", ["0", "9"]);
+  expectNotMatch("\\d", ["", "b"]);
+});
+
+it("non-digit", () => {
+  expectNotMatch("\\D", ["0", "9", ""]);
+  expectMatch("\\D", ["b", "|"]);
+});
+
+it("word", () => {
+  expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]);
+  expectNotMatch("\\w", ["", "$"]);
+});
+
+it("not word", () => {
+  expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]);
+  expectMatch("\\W", ["&", "$"]);
+});
+
+it("whitespace", () => {
+  expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]);
+  expectNotMatch("\\s", ["", "a", "0"]);
+});
+
+it("not whitespace", () => {
+  expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]);
+  expectMatch("\\S", ["a", "0"]);
+});
+
+it("tab, cr, lf, vt, ff", () => {
+  expectMatch("\\t", ["\t"]);
+  expectMatch("\\r", ["\r"]);
+  expectMatch("\\n", ["\n"]);
+  expectMatch("\\v", ["\v"]);
+  expectMatch("\\f", ["\f"]);
+  expectNotMatch("\\t", ["a", " ", ""]);
+});
+
+it("escaped dot", () => {
+  expectMatch("\\.", ["."]);
+  expectNotMatch("\\.", ["", "a"]);
+});
diff --git a/__tests__/character-sets.js b/__tests__/character-sets.js
new file mode 100644
index 0000000..74384be
--- /dev/null
+++ b/__tests__/character-sets.js
@@ -0,0 +1,41 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("matches discrete characters", () => {
+  expectMatch("[abce]", ["a", "b", "c", "e"]);
+  expectNotMatch("[abce]", ["", "f", "h"]);
+});
+
+it("matches character ranges", () => {
+  expectMatch("[a-c]", ["a", "b", "c"]);
+  expectNotMatch("[a-c]", ["d", "e", ""]);
+  expectMatch("[K-M]", ["K", "L", "M"]);
+  expectNotMatch("[K-M]", ["9", "J"]);
+  expectMatch("[0-9]", ["0", "9"]);
+  expectNotMatch("[0-9]", ["a", "A"]);
+});
+
+it("matches multiple ranges", () => {
+  expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]);
+  expectNotMatch("[a-ce-f]", ["d"]);
+});
+
+it("supports closing brackets", () => {
+  expectMatch("[]a]", ["]", "a"]);
+});
+
+it("supports negated sets", () => {
+  expectNotMatch("[^a-c]", ["a", "b", "c"]);
+  expectMatch("[^a-c]", ["d", "e"]);
+  expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]);
+  expectMatch("[^a-ce-f]", ["d"]);
+});
+
+it("treats - as a literal", () => {
+  expectMatch("[-abc]", ["-", "a", "b", "c"]);
+  expectMatch("[abc-]", ["-", "a", "b", "c"]);
+});
+
+it("treats - as a literal in negated sets", () => {
+  expectNotMatch("[^-abc]", ["-", "a", "b", "c"]);
+  expectMatch("[^-abc]", ["1", "A"]);
+});
diff --git a/__tests__/characters.js b/__tests__/characters.js
new file mode 100644
index 0000000..28e37f5
--- /dev/null
+++ b/__tests__/characters.js
@@ -0,0 +1,11 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("single character", () => {
+  expectMatch("a", ["a"]);
+  expectNotMatch("a", ["fish", ""]);
+});
+
+it("concatenation", () => {
+  expectMatch("ab", ["ab"]);
+  expectNotMatch("ab", ["aac", "aa", ""]);
+});
diff --git a/__tests__/index.js b/__tests__/index.js
new file mode 100644
index 0000000..ea9a55c
--- /dev/null
+++ b/__tests__/index.js
@@ -0,0 +1,71 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+describe("regexp", () => {
+  it("match returns correct substring", () => {
+    const match = matches("\\d", "asd123asd");
+    expect(match.index).toEqual(3);
+    expect(match.input).toEqual("asd123asd");
+    expect(match.matches[0]).toEqual("1");
+  });
+
+  describe("global mode", () => {
+    it("increments lastIndex", () => {
+      const regex = new RegExp("\\d+", "g");
+      const match = regex.exec("dog 23 fish 45 cat");
+      expect(match.matches[0]).toEqual("23");
+      expect(regex.lastIndex).toEqual(6);
+    });
+
+    it("uses lastIndex to support multiple matches", () => {
+      const regex = new RegExp("\\d+", "g");
+
+      let match = regex.exec("dog 23 fish 45 cat");
+      expect(match.matches[0]).toEqual("23");
+      expect(regex.lastIndex).toEqual(6);
+
+      match = regex.exec("dog 23 fish 45 cat");
+      expect(match.matches[0]).toEqual("45");
+      expect(regex.lastIndex).toEqual(14);
+
+      match = regex.exec("dog 23 fish 45 cat");
+      expect(match).toBeNull();
+      expect(regex.lastIndex).toEqual(0);
+    });
+  });
+
+  describe("non-global mode", () => {
+    it("doesn't increment lastIndex", () => {
+      const regex = new RegExp("\\d+");
+
+      let match = regex.exec("dog 23 fish 45 cat");
+      expect(match.matches[0]).toEqual("23");
+      expect(regex.lastIndex).toEqual(0);
+
+      match = regex.exec("dog 23 fish 45 cat");
+      expect(match.matches[0]).toEqual("23");
+      expect(regex.lastIndex).toEqual(0);
+    });
+  });
+});
+
+describe("use cases", () => {
+  it("matches combinations", () => {
+    expectMatch("\\s\\w*", [" bar"]);
+    expectMatch("\\S\\w*", ["foo"]);
+  });
+
+  it("email", () => {
+    const regex = ".+@.+\\..+";
+    expect(matches(regex, "colin@gmail.com")).toBeTruthy();
+    expect(matches(regex, "gmail")).toBeFalsy();
+
+    const capturingRegex = "(.+)@(.+)\\.(.+)";
+    expect(matches(capturingRegex, "colin@gmail.com")).toBeTruthy();
+
+    match = matches(capturingRegex, "colin@gmail.com");
+    expect(match.matches[0]).toEqual("colin@gmail.com");
+    expect(match.matches[1]).toEqual("colin");
+    expect(match.matches[2]).toEqual("gmail");
+    expect(match.matches[3]).toEqual("com");
+  });
+});
diff --git a/__tests__/quantifiers.js b/__tests__/quantifiers.js
new file mode 100644
index 0000000..c5d5cf4
--- /dev/null
+++ b/__tests__/quantifiers.js
@@ -0,0 +1,38 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("matches empty strings", () => {
+  expectMatch("a?", [""]);
+  expectMatch("a*", [""]);
+});
+
+it("zero or one", () => {
+  expectMatch("a?", ["a"]);
+  // expectNotMatch("a?", ["bc"]);
+});
+
+it("one or more", () => {
+  expectMatch("a+", ["a", "aa"]);
+  expectNotMatch("a+", [""]);
+});
+
+it("zero or more", () => {
+  expectMatch("a*", ["aa", "aaaa"]);
+});
+
+it("multiple rules", () => {
+  expectMatch("a*b", ["b", "ab", "aaaab"]);
+  expectNotMatch("a*b", ["aaaad"]);
+});
+
+it("zero or more is greedy", () => {
+  let match = matches("a*", "aaaaa");
+  expect(match).not.toBeNull();
+  expect(match.matches[0]).toEqual("aaaaa");
+});
+
+it("one or more is greedy", () => {
+  let match = matches("a+", "aaaaa");
+  console.log(match);
+  expect(match).not.toBeNull();
+  expect(match.matches[0]).toEqual("aaaaa");
+});
diff --git a/__tests__/range-quantifiers.js b/__tests__/range-quantifiers.js
new file mode 100644
index 0000000..539122d
--- /dev/null
+++ b/__tests__/range-quantifiers.js
@@ -0,0 +1,27 @@
+const { RegExp, expectNotMatch, expectMatch, matches } = require("./util");
+
+it("handles single quantifier", () => {
+  expectMatch("a{2}", ["aa"]);
+  expectMatch("ba{2}", ["baa"]);
+  expectMatch("ba{1}b", ["bab"]);
+});
+
+it("handles open upper bound quantifiers", () => {
+  expectMatch("a{2,}", ["aa", "aaaaa"]);
+  expectMatch("ba{2,}", ["baa", "baaaaaaa"]);
+  expectMatch("ba{1,}b", ["bab", "baaaaaab"]);
+});
+
+it("handles explicit upper bound quantifiers", () => {
+  const match = matches("a{2,4}", "aaaaaaaaaa");
+  expect(match.matches[0]).toEqual("aaaa");
+});
+
+it("handles zero value quantifier", () => {
+  expectMatch("ba{0}b", ["bb"]);
+});
+
+it("handles quantifiers within alternates", () => {
+  expectMatch("a{2}|b{2}", ["bb", "aa"]);
+  expectNotMatch("a{2}|b{2}", ["cc"]);
+});
diff --git a/test/util.js b/__tests__/util.js
similarity index 71%
rename from test/util.js
rename to __tests__/util.js
index bf6b154..a7a086a 100644
--- a/test/util.js
+++ b/__tests__/util.js
@@ -2,7 +2,7 @@ global.TextDecoder = require("text-encoding").TextDecoder;
 const fs = require("fs");
 const loader = require("@assemblyscript/loader/umd/index");
 
-class RegExpProxy {
+class RegExp {
   constructor(regex, flags = "") {
     this.wasmModule = loader.instantiateSync(
       fs.readFileSync("./build/untouched.wasm"),
@@ -80,4 +80,31 @@ class RegExpProxy {
   }
 }
 
-module.exports.RegExp = RegExpProxy;
+const expectMatch = (regex, arr) => {
+  arr.forEach((value) => {
+    const regexp = new RegExp(regex);
+    const match = regexp.exec(value);
+    expect(match).not.toBeNull();
+    expect(match.matches[0]).toEqual(value);
+  });
+};
+
+const expectNotMatch = (regex, arr) => {
+  arr.forEach((value) => {
+    const regexp = new RegExp(regex);
+    const match = regexp.exec(value);
+    expect(match).toBeNull();
+  });
+};
+
+const matches = (regex, value) => {
+  const regexp = new RegExp(regex);
+  return regexp.exec(value);
+};
+
+test.todo("no tests in this file!");
+
+module.exports.RegExp = RegExp;
+module.exports.matches = matches;
+module.exports.expectNotMatch = expectNotMatch;
+module.exports.expectMatch = expectMatch;
diff --git a/package.json b/package.json
index 7e5a3e7..1ade2c1 100644
--- a/package.json
+++ b/package.json
@@ -4,8 +4,8 @@
   "description": "A regex engine built with AssemblyScript",
   "main": "index.js",
   "scripts": {
-    "test": "npm run asbuild:untouched && npm run prettier:check && jest test/index.test.js",
-    "test:suite": "npm run asbuild:untouched && jest test/data.test.js --reporter=jest-summary-reporter",
+    "test": "npm run asbuild:untouched && npm run prettier:check && jest __tests__",
+    "test:suite": "npm run asbuild:untouched && jest __spec_tests__ --reporter=jest-summary-reporter",
     "prettier:check": "prettier --check .",
     "prettier:write": "prettier --write .",
     "asbuild:untouched": "asc assembly/index.ts --target debug",
diff --git a/test/index.test.js b/test/index.test.js
deleted file mode 100644
index 4f0ebcc..0000000
--- a/test/index.test.js
+++ /dev/null
@@ -1,337 +0,0 @@
-const { RegExp } = require("./util");
-
-const expectMatch = (regex, arr) => {
-  arr.forEach((value) => {
-    const regexp = new RegExp(regex);
-    const match = regexp.exec(value);
-    expect(match).not.toBeNull();
-    expect(match.matches[0]).toEqual(value);
-  });
-};
-
-const expectNotMatch = (regex, arr) => {
-  arr.forEach((value) => {
-    const regexp = new RegExp(regex);
-    const match = regexp.exec(value);
-    expect(match).toBeNull();
-  });
-};
-
-const matches = (regex, value) => {
-  const regexp = new RegExp(regex);
-  return regexp.exec(value);
-};
-
-describe("Characters", () => {
-  it("single character", () => {
-    expectMatch("a", ["a"]);
-    expectNotMatch("a", ["fish", ""]);
-  });
-
-  it("concatenation", () => {
-    expectMatch("ab", ["ab"]);
-    expectNotMatch("ab", ["aac", "aa", ""]);
-  });
-});
-
-describe("Quantifiers", () => {
-  it("matches empty strings", () => {
-    expectMatch("a?", [""]);
-    expectMatch("a*", [""]);
-  });
-
-  it("zero or one", () => {
-    expectMatch("a?", ["a"]);
-    // expectNotMatch("a?", ["bc"]);
-  });
-
-  it("one or more", () => {
-    expectMatch("a+", ["a", "aa"]);
-    expectNotMatch("a+", [""]);
-  });
-
-  it("zero or more", () => {
-    expectMatch("a*", ["aa", "aaaa"]);
-  });
-
-  it("multiple rules", () => {
-    expectMatch("a*b", ["b", "ab", "aaaab"]);
-    expectNotMatch("a*b", ["aaaad"]);
-  });
-
-  it("zero or more is greedy", () => {
-    let match = matches("a*", "aaaaa");
-    expect(match).not.toBeNull();
-    expect(match.matches[0]).toEqual("aaaaa");
-  });
-
-  it("one or more is greedy", () => {
-    let match = matches("a+", "aaaaa");
-    console.log(match);
-    expect(match).not.toBeNull();
-    expect(match.matches[0]).toEqual("aaaaa");
-  });
-});
-
-describe("Groups and ranges", () => {
-  it("or", () => {
-    expectMatch("a|b", ["b", "a"]);
-    expectNotMatch("a|b", ["c"]);
-    expectMatch("a|br", ["br", "a"]);
-    expectNotMatch("a|br", ["b", "c"]);
-  });
-
-  it("or multi-term", () => {
-    expectMatch("a|b|c", ["b", "a", "c"]);
-    expectNotMatch("a|b|c", ["d"]);
-    expectMatch("a|br|pc", ["br", "a", "pc"]);
-    expectNotMatch("a|br|pc", ["b", "pr"]);
-  });
-});
-
-describe("character sets", () => {
-  it("matches discrete characters", () => {
-    expectMatch("[abce]", ["a", "b", "c", "e"]);
-    expectNotMatch("[abce]", ["", "f", "h"]);
-  });
-
-  it("matches character ranges", () => {
-    expectMatch("[a-c]", ["a", "b", "c"]);
-    expectNotMatch("[a-c]", ["d", "e", ""]);
-    expectMatch("[K-M]", ["K", "L", "M"]);
-    expectNotMatch("[K-M]", ["9", "J"]);
-    expectMatch("[0-9]", ["0", "9"]);
-    expectNotMatch("[0-9]", ["a", "A"]);
-  });
-
-  it("matches multiple ranges", () => {
-    expectMatch("[a-ce-f]", ["a", "b", "c", "e", "f"]);
-    expectNotMatch("[a-ce-f]", ["d"]);
-  });
-
-  it("supports closing brackets", () => {
-    expectMatch("[]a]", ["]", "a"]);
-  });
-
-  it("supports negated sets", () => {
-    expectNotMatch("[^a-c]", ["a", "b", "c"]);
-    expectMatch("[^a-c]", ["d", "e"]);
-    expectNotMatch("[^a-ce-f]", ["a", "b", "c", "e", "f"]);
-    expectMatch("[^a-ce-f]", ["d"]);
-  });
-
-  it("treats - as a literal", () => {
-    expectMatch("[-abc]", ["-", "a", "b", "c"]);
-    expectMatch("[abc-]", ["-", "a", "b", "c"]);
-  });
-
-  it("treats - as a literal in negated sets", () => {
-    expectNotMatch("[^-abc]", ["-", "a", "b", "c"]);
-    expectMatch("[^-abc]", ["1", "A"]);
-  });
-});
-
-describe("character classes", () => {
-  it("dot", () => {
-    expectMatch(".", [" ", "B", "|", "9"]);
-    expectNotMatch(".", ["", "\n"]);
-  });
-
-  it("digit", () => {
-    expectMatch("\\d", ["0", "9"]);
-    expectNotMatch("\\d", ["", "b"]);
-  });
-
-  it("non-digit", () => {
-    expectNotMatch("\\D", ["0", "9", ""]);
-    expectMatch("\\D", ["b", "|"]);
-  });
-
-  it("word", () => {
-    expectMatch("\\w", ["A", "a", "Z", "z", "0", "9", "_"]);
-    expectNotMatch("\\w", ["", "$"]);
-  });
-
-  it("not word", () => {
-    expectNotMatch("\\W", ["A", "a", "Z", "z", "0", "9", "_", ""]);
-    expectMatch("\\W", ["&", "$"]);
-  });
-
-  it("whitespace", () => {
-    expectMatch("\\s", ["\f", "\n", "\r", "\t", "\v"]);
-    expectNotMatch("\\s", ["", "a", "0"]);
-  });
-
-  it("not whitespace", () => {
-    expectNotMatch("\\S", ["", "\f", "\n", "\r", "\t", "\v"]);
-    expectMatch("\\S", ["a", "0"]);
-  });
-
-  it("tab, cr, lf, vt, ff", () => {
-    expectMatch("\\t", ["\t"]);
-    expectMatch("\\r", ["\r"]);
-    expectMatch("\\n", ["\n"]);
-    expectMatch("\\v", ["\v"]);
-    expectMatch("\\f", ["\f"]);
-    expectNotMatch("\\t", ["a", " ", ""]);
-  });
-
-  it("escaped dot", () => {
-    expectMatch("\\.", ["."]);
-    expectNotMatch("\\.", ["", "a"]);
-  });
-});
-
-describe("boundary assertions", () => {
-  it("matches end of string", () => {
-    const regex = new RegExp("a$");
-    const match = regex.exec("ba");
-    expect(match.index).toEqual(1);
-    expect(match.matches[0]).toEqual("a");
-    expectNotMatch("a$", ["ab"]);
-  });
-
-  it("matches start of string", () => {
-    expectMatch("^a", ["a"]);
-    expectNotMatch("^a", ["ba"]);
-  });
-
-  it("handles escaped boundaries", () => {
-    expectMatch("\\^a", ["^a"]);
-    expectMatch("a\\$", ["a$"]);
-  });
-});
-
-describe("regexp", () => {
-  it("match returns correct substring", () => {
-    const match = matches("\\d", "asd123asd");
-    expect(match.index).toEqual(3);
-    expect(match.input).toEqual("asd123asd");
-    expect(match.matches[0]).toEqual("1");
-  });
-
-  describe("global mode", () => {
-    it("increments lastIndex", () => {
-      const regex = new RegExp("\\d+", "g");
-      const match = regex.exec("dog 23 fish 45 cat");
-      expect(match.matches[0]).toEqual("23");
-      expect(regex.lastIndex).toEqual(6);
-    });
-
-    it("uses lastIndex to support multiple matches", () => {
-      const regex = new RegExp("\\d+", "g");
-
-      let match = regex.exec("dog 23 fish 45 cat");
-      expect(match.matches[0]).toEqual("23");
-      expect(regex.lastIndex).toEqual(6);
-
-      match = regex.exec("dog 23 fish 45 cat");
-      expect(match.matches[0]).toEqual("45");
-      expect(regex.lastIndex).toEqual(14);
-
-      match = regex.exec("dog 23 fish 45 cat");
-      expect(match).toBeNull();
-      expect(regex.lastIndex).toEqual(0);
-    });
-  });
-
-  describe("non-global mode", () => {
-    it("doesn't increment lastIndex", () => {
-      const regex = new RegExp("\\d+");
-
-      let match = regex.exec("dog 23 fish 45 cat");
-      expect(match.matches[0]).toEqual("23");
-      expect(regex.lastIndex).toEqual(0);
-
-      match = regex.exec("dog 23 fish 45 cat");
-      expect(match.matches[0]).toEqual("23");
-      expect(regex.lastIndex).toEqual(0);
-    });
-  });
-});
-
-describe("capture groups", () => {
-  it("supports capture groups", () => {
-    let match = matches("a(\\d)a", "a3a");
-    expect(match.index).toEqual(0);
-    expect(match.input).toEqual("a3a");
-    expect(match.matches[0]).toEqual("a3a");
-    expect(match.matches[1]).toEqual("3");
-
-    match = matches("a(\\d)a", "  a3a");
-    expect(match.index).toEqual(2);
-    expect(match.input).toEqual("  a3a");
-    expect(match.matches[0]).toEqual("a3a");
-    expect(match.matches[1]).toEqual("3");
-
-    match = matches("a(\\d*)a", "a3456a");
-    expect(match.index).toEqual(0);
-    expect(match.input).toEqual("a3456a");
-    expect(match.matches[0]).toEqual("a3456a");
-    expect(match.matches[1]).toEqual("3456");
-
-    match = matches("a*(\\d*)(a*)", "aaa456aaa");
-    expect(match.index).toEqual(0);
-    expect(match.input).toEqual("aaa456aaa");
-    expect(match.matches[0]).toEqual("aaa456aaa");
-    expect(match.matches[1]).toEqual("456");
-    expect(match.matches[2]).toEqual("aaa");
-  });
-
-  it.skip("should not return captured values for non-matching alternations", () => {
-    const match = matches("(a|b)c|a(b|c)", "ab");
-    expect(match.matches[0]).toEqual("ab");
-    expect(match.matches[1]).toEqual("");
-    expect(match.matches[2]).toEqual("b");
-  });
-});
-
-describe("range quantifiers", () => {
-  it("handles single quantifier", () => {
-    expectMatch("a{2}", ["aa"]);
-    expectMatch("ba{2}", ["baa"]);
-    expectMatch("ba{1}b", ["bab"]);
-  });
-
-  it("handles open upper bound quantifiers", () => {
-    expectMatch("a{2,}", ["aa", "aaaaa"]);
-    expectMatch("ba{2,}", ["baa", "baaaaaaa"]);
-    expectMatch("ba{1,}b", ["bab", "baaaaaab"]);
-  });
-
-  it("handles explicit upper bound quantifiers", () => {
-    const match = matches("a{2,4}", "aaaaaaaaaa");
-    expect(match.matches[0]).toEqual("aaaa");
-  });
-
-  it("handles zero value quantifier", () => {
-    expectMatch("ba{0}b", ["bb"]);
-  });
-
-  it("handles quantifiers within alternates", () => {
-    expectMatch("a{2}|b{2}", ["bb", "aa"]);
-    expectNotMatch("a{2}|b{2}", ["cc"]);
-  });
-});
-
-describe("use cases", () => {
-  it("matches combinations", () => {
-    expectMatch("\\s\\w*", [" bar"]);
-    expectMatch("\\S\\w*", ["foo"]);
-  });
-
-  it("email", () => {
-    const regex = ".+@.+\\..+";
-    expect(matches(regex, "colin@gmail.com")).toBeTruthy();
-    expect(matches(regex, "gmail")).toBeFalsy();
-
-    const capturingRegex = "(.+)@(.+)\\.(.+)";
-    expect(matches(capturingRegex, "colin@gmail.com")).toBeTruthy();
-
-    match = matches(capturingRegex, "colin@gmail.com");
-    expect(match.matches[0]).toEqual("colin@gmail.com");
-    expect(match.matches[1]).toEqual("colin");
-    expect(match.matches[2]).toEqual("gmail");
-    expect(match.matches[3]).toEqual("com");
-  });
-});

From 356ed85ed83e2edfe7e4d190672c67c7bb1b9e6c Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 14:16:19 +0000
Subject: [PATCH 4/7] character sets support escaping of special chars

---
 __tests__/character-sets.js | 11 +++++++++++
 __tests__/quantifiers.js    |  1 -
 assembly/parser/parser.ts   | 25 ++++++++++++++++++++++---
 package.json                |  1 +
 ts/index.ts                 |  4 ++--
 5 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/__tests__/character-sets.js b/__tests__/character-sets.js
index 74384be..44b16a6 100644
--- a/__tests__/character-sets.js
+++ b/__tests__/character-sets.js
@@ -5,6 +5,17 @@ it("matches discrete characters", () => {
   expectNotMatch("[abce]", ["", "f", "h"]);
 });
 
+it("throws an error if no closing bracket is found", () => {
+  expect(() => new RegExp("[abce")).toThrow();
+});
+
+it("supports escaping of special characters", () => {
+  expectMatch("[a\\^b]", ["a", "b", "^"]);
+  expectMatch("[a\\-c]", ["a", "c", "-"]);
+  expectMatch("[a\\]]", ["a", "]"]);
+  expectMatch("[a\\\\b]", ["a", "\\"]);
+});
+
 it("matches character ranges", () => {
   expectMatch("[a-c]", ["a", "b", "c"]);
   expectNotMatch("[a-c]", ["d", "e", ""]);
diff --git a/__tests__/quantifiers.js b/__tests__/quantifiers.js
index c5d5cf4..8c43686 100644
--- a/__tests__/quantifiers.js
+++ b/__tests__/quantifiers.js
@@ -32,7 +32,6 @@ it("zero or more is greedy", () => {
 
 it("one or more is greedy", () => {
   let match = matches("a+", "aaaaa");
-  console.log(match);
   expect(match).not.toBeNull();
   expect(match.matches[0]).toEqual("aaaaa");
 });
diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts
index 43a9321..3c848d6 100644
--- a/assembly/parser/parser.ts
+++ b/assembly/parser/parser.ts
@@ -18,6 +18,16 @@ function isQuantifier(code: Char): bool {
   return code == Char.Question || code == Char.Plus || code == Char.Asterisk;
 }
 
+// Reports whether `code` has a special meaning inside a character set; the
+// set parser drops a backslash that directly precedes one of these so that
+// the character itself is matched literally.
+function isCharacterSetSpecialChar(code: Char): bool {
+  if (code == Char.Caret || code == Char.Minus) {
+    return true;
+  }
+  return code == Char.RightSquareBracket || code == Char.Backslash;
+}
+
 function isAssertion(code: u32): bool {
   return code == Char.Dollar || code == Char.Caret; // "$" or "^"
 }
@@ -228,16 +238,25 @@ export class Parser {
     while (this.currentToken != "]" || nodes.length == 0) {
       // lookahead for character range
       if (
-        this.cursor + 1 < u32(this.input.length) &&
+        this.cursor + 2 < u32(this.input.length) &&
+        this.currentToken != "\\" &&
         this.input.charCodeAt(this.cursor + 1) == Char.Minus &&
         this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket
       ) {
         nodes.push(this.parseCharacterRange());
       } else {
-        nodes.push(this.parseCharacter());
+        if (
+          this.currentToken == "\\" &&
+          isCharacterSetSpecialChar(this.input.charCodeAt(this.cursor + 1))
+        ) {
+          this.eatToken(Char.Backslash);
+        }
+        nodes.push(new CharacterNode(this.eatToken()));
       }
 
-      // TODO error if we run out of chars?
+      if (this.cursor >= u32(this.input.length)) {
+        throw new SyntaxError("Unterminated character class");
+      }
     }
     this.eatToken(Char.RightSquareBracket);
     return new CharacterSetNode(nodes, negated);
diff --git a/package.json b/package.json
index 1ade2c1..59183b9 100644
--- a/package.json
+++ b/package.json
@@ -6,6 +6,7 @@
   "scripts": {
     "test": "npm run asbuild:untouched && npm run prettier:check && jest __tests__",
     "test:suite": "npm run asbuild:untouched && jest __spec_tests__ --reporter=jest-summary-reporter",
+    "jest": "jest __tests__",
     "prettier:check": "prettier --check .",
     "prettier:write": "prettier --write .",
     "asbuild:untouched": "asc assembly/index.ts --target debug",
diff --git a/ts/index.ts b/ts/index.ts
index fe02757..2d42465 100644
--- a/ts/index.ts
+++ b/ts/index.ts
@@ -5,7 +5,7 @@ globalAny.log = console.log;
 
 import { RegExp } from "../assembly/regexp";
 
-const regexObj = new RegExp("[]a]");
-const match = regexObj.exec("]");
+const regexObj = new RegExp("[a\\\\c]");
+const match = regexObj.exec("\\");
 
 console.log(match);

From 417f7834542d632e4249272c2de755b9d86ccaf4 Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 15:44:10 +0000
Subject: [PATCH 5/7] Added string iterator concept to parser

---
 assembly/parser/parser.ts | 110 +++++++++++++++++++++++++-------------
 ts/index.ts               |   4 +-
 2 files changed, 74 insertions(+), 40 deletions(-)

diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts
index 3c848d6..be8db87 100644
--- a/assembly/parser/parser.ts
+++ b/assembly/parser/parser.ts
@@ -58,44 +58,78 @@ class Range {
   to: i32 = -1;
 }
 
-export class Parser {
-  currentToken: string = "";
+class StringIterator {
+  current: u32;
   cursor: u32 = 0;
 
-  private constructor(public input: string) {}
+  constructor(private sourceString: string) {
+    this.current = this.sourceString.charCodeAt(0);
+  }
 
-  static toAST(input: string): AST {
-    return new Parser(input).toAST();
+  lookahead(distance: u32): u32 {
+    return this.sourceString.charCodeAt(this.cursor + distance);
   }
 
-  private eatToken(value: u32 = -1): u32 {
-    const token = this.currentToken.charCodeAt(0) as u32;
-    if (value != -1 && token != value) {
-      throw new Error("invalid token");
+  next(): bool {
+    this.cursor++;
+    if (this.cursor >= u32(this.sourceString.length)) {
+      return false;
     }
-    this.currentToken = this.input.charAt(++this.cursor);
-    return token;
+    this.current = this.sourceString.charCodeAt(this.cursor);
+    return true;
+  }
+
+  currentAsString(): string {
+    return String.fromCharCode(this.current);
+  }
+
+  more(): bool {
+    return this.cursor < u32(this.sourceString.length);
+  }
+
+  copy(): StringIterator {
+    const iterator = new StringIterator(this.sourceString);
+    iterator.cursor = this.cursor;
+    iterator.current = this.current;
+    return iterator;
   }
+}
+
+export class Parser {
+  // currentToken: string = "";
+  // cursor: u32 = 0;
+  iterator: StringIterator;
 
-  private more(): bool {
-    return this.currentToken.length > 0;
+  private constructor(input: string) {
+    this.iterator = new StringIterator(input);
+  }
+
+  static toAST(input: string): AST {
+    return new Parser(input).toAST();
   }
 
-  private resetCursor(): void {
-    this.cursor = 0;
-    this.currentToken = this.input.charAt(0);
+  private eatToken(value: u32 = -1): u32 {
+    const currentToken = this.iterator.current;
+    if (value != -1 && this.iterator.current != value) {
+      throw new Error("invalid token");
+    }
+    this.iterator.next();
+    return currentToken;
   }
 
   private toAST(): AST {
-    this.resetCursor();
     return new AST(this.parseSequence());
   }
 
+  private currentCharCode(): u32 {
+    return this.iterator.current;
+  }
+
   private parseCharacter(): Node {
-    let token = this.currentToken.charCodeAt(0);
+    let token = this.iterator.current;
     if (token == Char.Backslash) {
       this.eatToken(Char.Backslash);
-      token = this.currentToken.charCodeAt(0);
+      token = this.iterator.current;
       if (isSpecialCharacter(token)) {
         this.eatToken();
         return new CharacterNode(token);
@@ -120,20 +154,20 @@ export class Parser {
 
   private maybeParseRepetitionRange(): Range {
     // snapshot
-    const previousCursor = this.cursor;
+    const iteratorCopy = this.iterator.copy();
     this.eatToken(Char.LeftCurlyBrace);
 
     let range = new Range();
 
     let firstDigit = true;
     let digitStr = "";
-    while (this.more()) {
-      const token = this.currentToken.charCodeAt(0);
+    while (this.iterator.more()) {
+      const token = this.iterator.current;
       if (token == Char.RightParenthesis) break;
       if (firstDigit) {
         if (isDigit(token)) {
           // if it is a digit, keep eating
-          digitStr += this.currentToken;
+          digitStr += this.iterator.currentAsString();
         } else {
           range.from = digitStr.length ? <i32>parseInt(digitStr) : -1;
           range.to = range.from;
@@ -154,7 +188,7 @@ export class Parser {
       } else {
         if (isDigit(token)) {
           // if it is a digit, keep eating
-          digitStr += this.currentToken;
+          digitStr += this.iterator.currentAsString();
         } else {
           range.to = digitStr.length ? <i32>parseInt(digitStr) : -1;
           if (token == Char.RightCurlyBrace) {
@@ -171,8 +205,7 @@ export class Parser {
     }
 
     // repetition not found - reset state
-    this.cursor = previousCursor;
-    this.currentToken = this.input.charAt(previousCursor);
+    this.iterator = iteratorCopy;
 
     return range;
   }
@@ -180,8 +213,8 @@ export class Parser {
   // parses a sequence of chars
   private parseSequence(): Node {
     let nodes = new Array<Node>();
-    while (this.more()) {
-      const token = this.currentToken.charCodeAt(0);
+    while (this.iterator.more()) {
+      const token = this.iterator.current;
       if (token == Char.RightParenthesis) break;
       // @ts-ignore
       if (token == Char.VerticalBar) {
@@ -227,34 +260,35 @@ export class Parser {
 
   private parseCharacterSet(): CharacterSetNode {
     this.eatToken(Char.LeftSquareBracket);
-    const token = this.currentToken.charCodeAt(0);
 
-    const negated = token == Char.Caret;
+    const negated = this.iterator.current == Char.Caret;
     if (negated) {
       this.eatToken(Char.Caret);
     }
 
     const nodes = new Array<Node>();
-    while (this.currentToken != "]" || nodes.length == 0) {
+    while (
+      this.iterator.current != Char.RightSquareBracket ||
+      nodes.length == 0
+    ) {
       // lookahead for character range
       if (
-        this.cursor + 2 < u32(this.input.length) &&
-        this.currentToken != "\\" &&
-        this.input.charCodeAt(this.cursor + 1) == Char.Minus &&
-        this.input.charCodeAt(this.cursor + 2) != Char.RightSquareBracket
+        this.iterator.current != Char.Backslash &&
+        this.iterator.lookahead(1) == Char.Minus &&
+        this.iterator.lookahead(2) != Char.RightSquareBracket
       ) {
         nodes.push(this.parseCharacterRange());
       } else {
         if (
-          this.currentToken == "\\" &&
-          isCharacterSetSpecialChar(this.input.charCodeAt(this.cursor + 1))
+          this.iterator.current == Char.Backslash &&
+          isCharacterSetSpecialChar(this.iterator.lookahead(1))
         ) {
           this.eatToken(Char.Backslash);
         }
         nodes.push(new CharacterNode(this.eatToken()));
       }
 
-      if (this.cursor >= u32(this.input.length)) {
+      if (!this.iterator.more()) {
         throw new SyntaxError("Unterminated character class");
       }
     }
diff --git a/ts/index.ts b/ts/index.ts
index 2d42465..f2ecd61 100644
--- a/ts/index.ts
+++ b/ts/index.ts
@@ -5,7 +5,7 @@ globalAny.log = console.log;
 
 import { RegExp } from "../assembly/regexp";
 
-const regexObj = new RegExp("[a\\\\c]");
-const match = regexObj.exec("\\");
+const regexObj = new RegExp("[abce]");
+const match = regexObj.exec("a");
 
 console.log(match);

From bc8bc1fb1bcf308a06c0b011d13f75b016a60b96 Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 15:57:57 +0000
Subject: [PATCH 6/7] string iterator into new file

---
 assembly/parser/parser.ts          | 40 +-----------------------------
 assembly/parser/string-iterator.ts | 36 +++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 39 deletions(-)
 create mode 100644 assembly/parser/string-iterator.ts

diff --git a/assembly/parser/parser.ts b/assembly/parser/parser.ts
index be8db87..a9d1017 100644
--- a/assembly/parser/parser.ts
+++ b/assembly/parser/parser.ts
@@ -1,4 +1,5 @@
 import { isDigit, Char } from "../char";
+import { StringIterator } from "./string-iterator";
 import {
   AST,
   RangeRepetitionNode,
@@ -58,46 +59,7 @@ class Range {
   to: i32 = -1;
 }
 
-class StringIterator {
-  current: u32;
-  cursor: u32 = 0;
-
-  constructor(private sourceString: string) {
-    this.current = this.sourceString.charCodeAt(0);
-  }
-
-  lookahead(distance: u32): u32 {
-    return this.sourceString.charCodeAt(this.cursor + distance);
-  }
-
-  next(): bool {
-    this.cursor++;
-    if (this.cursor >= u32(this.sourceString.length)) {
-      return false;
-    }
-    this.current = this.sourceString.charCodeAt(this.cursor);
-    return true;
-  }
-
-  currentAsString(): string {
-    return String.fromCharCode(this.current);
-  }
-
-  more(): bool {
-    return this.cursor < u32(this.sourceString.length);
-  }
-
-  copy(): StringIterator {
-    const iterator = new StringIterator(this.sourceString);
-    iterator.cursor = this.cursor;
-    iterator.current = this.current;
-    return iterator;
-  }
-}
-
 export class Parser {
-  // currentToken: string = "";
-  // cursor: u32 = 0;
   iterator: StringIterator;
 
   private constructor(input: string) {
diff --git a/assembly/parser/string-iterator.ts b/assembly/parser/string-iterator.ts
new file mode 100644
index 0000000..fbd49b7
--- /dev/null
+++ b/assembly/parser/string-iterator.ts
@@ -0,0 +1,36 @@
+// Cursor over the char codes of a string with lookahead and snapshot support.
+export class StringIterator {
+  current: u32;
+  cursor: u32 = 0;
+
+  constructor(private sourceString: string) {
+    this.current = this.sourceString.charCodeAt(0);
+  }
+
+  lookahead(distance: u32): u32 {
+    return this.sourceString.charCodeAt(this.cursor + distance);
+  }
+
+  // Steps forward; `current` is refreshed only while characters remain.
+  next(): bool {
+    this.cursor += 1;
+    if (!this.more()) return false;
+    this.current = this.sourceString.charCodeAt(this.cursor);
+    return true;
+  }
+
+  currentAsString(): string {
+    return String.fromCharCode(this.current);
+  }
+
+  more(): bool {
+    return this.cursor < u32(this.sourceString.length);
+  }
+
+  copy(): StringIterator {
+    const snapshot = new StringIterator(this.sourceString);
+    snapshot.cursor = this.cursor;
+    snapshot.current = this.current;
+    return snapshot;
+  }
+}

From 267b948c61d3c6e599fd058903b56d8f7f1dc9f5 Mon Sep 17 00:00:00 2001
From: Colin E <colin.eberhardt@gmail.com>
Date: Mon, 25 Jan 2021 16:03:36 +0000
Subject: [PATCH 7/7] updated to use NodeType

---
 assembly/nfa/matcher.ts | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/assembly/nfa/matcher.ts b/assembly/nfa/matcher.ts
index 9c32f0c..93a4067 100644
--- a/assembly/nfa/matcher.ts
+++ b/assembly/nfa/matcher.ts
@@ -5,6 +5,8 @@ import {
   CharacterSetNode,
   CharacterClassNode,
   CharacterRangeNode,
+  NodeType,
+  Node,
 } from "../parser/node";
 import { Match } from "../regexp";
 
@@ -27,12 +29,13 @@ export class Matcher {
 
   static fromCharacterSetNode(node: CharacterSetNode): CharacterSetMatcher {
     const matchers = node.expressions.map<Matcher>((exp) => {
-      if (CharacterRangeNode.is(exp)) {
-        return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode);
-      } else if (CharacterNode.is(exp)) {
-        return Matcher.fromCharacterNode(exp as CharacterNode);
-      } else {
-        throw new Error("unsupported node type within character set");
+      switch (exp.type) {
+        case NodeType.CharacterRange:
+          return Matcher.fromCharacterRangeNode(exp as CharacterRangeNode);
+        case NodeType.Character:
+          return Matcher.fromCharacterNode(exp as CharacterNode);
+        default:
+          throw new Error("unsupported node type within character set");
       }
     });
     return new CharacterSetMatcher(matchers, node.negated);