Skip to content

Commit f50e8dd

Browse files
committed
More string specializations and performance improvements
1 parent 68adc70 commit f50e8dd

File tree

6 files changed

+59
-51
lines changed

6 files changed

+59
-51
lines changed

bench/Main.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -147,15 +147,15 @@ main = do
147147
$ \_ -> runParser smallJson BenchParsing.json
148148

149149
log "StringParser.runParser json smallJson"
150-
benchWith 500
150+
benchWith 1000
151151
$ \_ -> StringParser.runParser BenchStringParser.json smallJson
152152

153153
log "runParser json mediumJson"
154154
benchWith 500
155155
$ \_ -> runParser mediumJson BenchParsing.json
156156

157157
log "StringParser.runParser json mediumJson"
158-
benchWith 1000
158+
benchWith 500
159159
$ \_ -> StringParser.runParser BenchStringParser.json mediumJson
160160

161161
log "runParser json largeJson"

spago-dev.dhall

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ in conf //
1212
, dependencies = conf.dependencies #
1313
[ "assert"
1414
, "console"
15+
, "enums"
1516
, "effect"
1617
, "psci-support"
1718
, "minibench"

spago.dhall

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
[ "arrays"
66
, "control"
77
, "either"
8+
, "enums"
89
, "foldable-traversable"
910
, "functions"
1011
, "identity"
@@ -23,7 +24,6 @@
2324
, "tuples"
2425
, "unfoldable"
2526
, "unicode"
26-
, "unsafe-coerce"
2727
]
2828
, packages = ./packages.dhall
2929
, sources = [ "src/**/*.purs" ]

src/Text/Parsing/Parser.purs

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -205,21 +205,20 @@ instance Bind (ParserT s m) where
205205
instance Monad (ParserT s m)
206206

207207
instance MonadRec (ParserT s m) where
208-
tailRecM next = go
209-
where
210-
go a = ParserT
211-
( mkFn5 \state1 more lift throw done ->
212-
more \_ -> do
213-
let (ParserT k1) = next a
214-
runFn5 k1 state1 more lift throw
215-
( mkFn2 \state2 step -> case step of
216-
Loop b -> do
217-
let (ParserT k2) = go b
218-
runFn5 k2 state2 more lift throw done
219-
Done c ->
220-
runFn2 done state2 c
208+
tailRecM next initArg = ParserT
209+
( mkFn5 \state1 more lift throw done -> do
210+
let
211+
loop = mkFn2 \state2 arg -> do
212+
let (ParserT k1) = next arg
213+
runFn5 k1 state2 more lift throw
214+
( mkFn2 \state3 step -> case step of
215+
Loop nextArg ->
216+
runFn2 loop state3 nextArg
217+
Done res ->
218+
runFn2 done state3 res
221219
)
222-
)
220+
runFn2 loop state1 initArg
221+
)
223222

224223
instance MonadState (ParseState s) (ParserT s m) where
225224
state k = ParserT

src/Text/Parsing/Parser/Combinators.purs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ import Prelude
100100
import Control.Lazy (defer)
101101
import Control.Monad.Rec.Class (Step(..), tailRecM)
102102
import Control.Plus (empty, (<|>), alt)
103-
import Data.Foldable (class Foldable, foldl)
103+
import Data.Foldable (class Foldable, foldl, foldr)
104104
import Data.Function.Uncurried (mkFn2, mkFn5, runFn2, runFn5)
105105
import Data.List (List(..), many, manyRec, reverse, (:))
106106
import Data.List.NonEmpty (NonEmptyList, cons')
@@ -442,7 +442,7 @@ chainr1Rec p f = do
442442

443443
-- | Parse one of a set of alternatives.
444444
choice :: forall f m s a. Foldable f => f (ParserT s m a) -> ParserT s m a
445-
choice = foldl (<|>) empty
445+
choice = foldr (<|>) empty
446446

447447
-- | Skip many instances of a phrase.
448448
skipMany :: forall s a m. ParserT s m a -> ParserT s m Unit

src/Text/Parsing/Parser/String.purs

Lines changed: 40 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -45,23 +45,23 @@ import Prelude hiding (between)
4545
import Control.Monad.State (get, put, state)
4646
import Data.Array (notElem)
4747
import Data.Array.NonEmpty as NonEmptyArray
48-
import Data.Char (fromCharCode)
4948
import Data.CodePoint.Unicode (isSpace)
5049
import Data.Either (Either(..))
50+
import Data.Enum (fromEnum, toEnum)
5151
import Data.Foldable (elem)
5252
import Data.Function.Uncurried (mkFn5, runFn2)
53-
import Data.Maybe (Maybe(..))
54-
import Data.String (CodePoint, Pattern(..), length, null, singleton, splitAt, stripPrefix, uncons)
53+
import Data.Maybe (Maybe(..), fromJust)
54+
import Data.String (CodePoint, Pattern(..), codePointAt, length, null, singleton, splitAt, stripPrefix, takeWhile, uncons)
5555
import Data.String.CodeUnits as SCU
5656
import Data.String.Regex as Regex
5757
import Data.String.Regex.Flags (RegexFlags(..), RegexFlagsRec)
5858
import Data.Tuple (Tuple(..), fst)
59+
import Partial.Unsafe (unsafePartial)
5960
import Prim.Row (class Nub, class Union)
6061
import Record (merge)
6162
import Text.Parsing.Parser (ParseError(..), ParseState(..), ParserT(..), fail)
62-
import Text.Parsing.Parser.Combinators (skipMany, tryRethrow, (<?>), (<~?>))
63+
import Text.Parsing.Parser.Combinators (tryRethrow, (<?>), (<~?>))
6364
import Text.Parsing.Parser.Pos (Position(..))
64-
import Unsafe.Coerce (unsafeCoerce)
6565

6666
-- | Match “end-of-file,” the end of the input stream.
6767
eof :: forall m. ParserT String m Unit
@@ -93,26 +93,33 @@ string str = ParserT
9393
-- | Match any BMP `Char`.
9494
-- | Parser will fail if the character is not in the Basic Multilingual Plane.
9595
anyChar :: forall m. ParserT String m Char
96-
anyChar = tryRethrow do
97-
cp :: Int <- unCodePoint <$> anyCodePoint
98-
-- the `fromCharCode` function doesn't check if this is beyond the
99-
-- BMP, so we check that ourselves.
100-
-- https://github.com/purescript/purescript-strings/issues/153
101-
if cp > 65535 -- BMP
102-
then fail "Not a Char"
103-
else case fromCharCode cp of
104-
Nothing -> fail "Not a Char"
105-
Just c -> pure c
96+
anyChar = ParserT
97+
( mkFn5 \state1@(ParseState input pos _) _ _ throw done ->
98+
case uncons input of
99+
Nothing ->
100+
runFn2 throw state1 (ParseError "Unexpected EOF" pos)
101+
Just { head, tail } -> do
102+
let cp = fromEnum head
103+
-- the `fromCharCode` function doesn't check if this is beyond the
104+
-- BMP, so we check that ourselves.
105+
-- https://github.com/purescript/purescript-strings/issues/153
106+
if cp < 0 || cp > 65535 then
107+
runFn2 throw state1 (ParseError "Expected Char" pos)
108+
else
109+
runFn2 done (ParseState tail (updatePosSingle pos head) true) (unsafePartial fromJust (toEnum cp))
110+
)
106111

107112
-- | Match any Unicode character.
108113
-- | Always succeeds.
109114
anyCodePoint :: forall m. ParserT String m CodePoint
110-
anyCodePoint = join $ state \state1@(ParseState input position _) ->
111-
case uncons input of
112-
Nothing ->
113-
Tuple (fail "Unexpected EOF") state1
114-
Just { head, tail } ->
115-
Tuple (pure head) (ParseState tail (updatePosSingle position head) true)
115+
anyCodePoint = ParserT
116+
( mkFn5 \state1@(ParseState input pos _) _ _ throw done ->
117+
case uncons input of
118+
Nothing ->
119+
runFn2 throw state1 (ParseError "Unexpected EOF" pos)
120+
Just { head, tail } ->
121+
runFn2 done (ParseState tail (updatePosSingle pos head) true) head
122+
)
116123

117124
-- | Match a BMP `Char` satisfying the predicate.
118125
satisfy :: forall m. (Char -> Boolean) -> ParserT String m Char
@@ -148,7 +155,12 @@ whiteSpace = fst <$> match skipSpaces
148155

149156
-- | Skip whitespace characters and throw them away. Always succeeds.
150157
skipSpaces :: forall m. ParserT String m Unit
151-
skipSpaces = skipMany (satisfyCodePoint isSpace)
158+
skipSpaces = ParserT
159+
( mkFn5 \(ParseState input pos _) _ _ _ done -> do
160+
let head = takeWhile isSpace input
161+
let tail = SCU.drop (SCU.length head) input
162+
runFn2 done (ParseState tail (updatePosString pos head) true) unit
163+
)
152164

153165
-- | Match one of the BMP `Char`s in the array.
154166
oneOf :: forall m. Array Char -> ParserT String m Char
@@ -168,14 +180,16 @@ noneOfCodePoints ss = satisfyCodePoint (flip notElem ss) <~?> \_ -> "none of " <
168180

169181
-- | Updates a `Position` by adding the columns and lines in `String`.
170182
updatePosString :: Position -> String -> Position
171-
updatePosString pos str = case uncons str of
172-
Nothing -> pos
173-
Just { head, tail } -> updatePosString (updatePosSingle pos head) tail -- tail recursive
183+
updatePosString = go 0
184+
where
185+
go ix pos str = case codePointAt ix str of
186+
Nothing -> pos
187+
Just cp -> go (ix + 1) (updatePosSingle pos cp) str
174188

175189
-- | Updates a `Position` by adding the columns and lines in a
176190
-- | single `CodePoint`.
177191
updatePosSingle :: Position -> CodePoint -> Position
178-
updatePosSingle (Position { line, column }) cp = case unCodePoint cp of
192+
updatePosSingle (Position { line, column }) cp = case fromEnum cp of
179193
10 -> Position { line: line + 1, column: 1 } -- "\n"
180194
13 -> Position { line: line + 1, column: 1 } -- "\r"
181195
9 -> Position { line, column: column + 8 - ((column - 1) `mod` 8) } -- "\t" Who says that one tab is 8 columns?
@@ -211,12 +225,6 @@ match p = do
211225
-- boundary.
212226
pure $ Tuple (SCU.take (SCU.length input1 - SCU.length input2) input1) x
213227

214-
-- | The CodePoint newtype constructor is not exported, so here's a helper.
215-
-- | This will break at runtime if the definition of CodePoint ever changes
216-
-- | to something other than `newtype CodePoint = CodePoint Int`.
217-
unCodePoint :: CodePoint -> Int
218-
unCodePoint = unsafeCoerce
219-
220228
-- | Parser which uses the `Data.String.Regex` module to match the regular
221229
-- | expression pattern passed as the `String`
222230
-- | argument to the parser.

0 commit comments

Comments
 (0)