Skip to content

Commit ef78b90

Browse files
committed
Implement Handlebars lexer
This should produce the exact same tokens for any template as the official Handlebars JS implementation.
1 parent cb148cc commit ef78b90

File tree

5 files changed

+373
-0
lines changed

5 files changed

+373
-0
lines changed

src/Lexer.php

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars;
4+
5+
use DevTheorem\Handlebars\Phlexer\Phlexer;
6+
use DevTheorem\Handlebars\Phlexer\Rule;
7+
8+
/**
 * Implements the same lexical tokenization from
 * https://github.com/handlebars-lang/handlebars-parser/blob/master/src/handlebars.l
 * (as of 2025-01-07).
 *
 * The constructor registers an ordered list of rules; for the current state the
 * first rule whose pattern matches at the start of the remaining input wins.
 * States used by the rules:
 *  - INITIAL: plain template content outside of any mustache
 *  - mu:      inside a mustache
 *  - emu:     content following an escaped mustache (`\{{`)
 *  - raw:     inside a raw block (`{{{{name}}}} ... {{{{/name}}}}`)
 *  - com:     inside a `{{!-- ... --}}` comment
 *  - escl:    inside a `[...]` escaped literal
 */
final class Lexer extends Phlexer
{
    public function __construct()
    {
        $LEFT_STRIP = $RIGHT_STRIP = '~';
        $LOOKAHEAD = '[=~}\\s\\/.)\\]|]';
        $LITERAL_LOOKAHEAD = '[~}\\s)\\]]';

        /*
         * ID is the inverse of control characters.
         * Control characters ranges:
         *   [\s]          Whitespace
         *   [!"#%-,\./]   !, ", #, %, &, ', (, ), *, +, ,, ., /,  Exceptions in range: $, -
         *   [;->@]        ;, <, =, >, @,                          Exceptions in range: :, ?
         *   [\[-\^`]      [, \, ], ^, `,                          Exceptions in range: _
         *   [\{-~]        {, |, }, ~
         */
        $CTRL_INVERSE = '[^\\s!"#%-,\\.\\/;->@\\[-\\^`\\{-~]+';
        $ID = $CTRL_INVERSE . '(?=' . $LOOKAHEAD . ')';

        parent::__construct([
            // content up to the next mustache; a trailing backslash escapes the mustache
            new Rule('[^\\x00]*?(?={{)', function () {
                if (str_ends_with($this->yytext, "\\\\")) {
                    // escaped backslash: drop one backslash, the mustache itself is live
                    $this->strip(0, 1);
                    $this->pushState('mu');
                } elseif (str_ends_with($this->yytext, "\\")) {
                    // escaped mustache: drop the backslash, lex the mustache as content
                    $this->strip(0, 1);
                    $this->pushState('emu');
                } else {
                    $this->pushState('mu');
                }

                return $this->yytext !== '' ? 'CONTENT' : null;
            }),

            new Rule('[^\\x00]+', fn() => 'CONTENT'),

            // marks CONTENT up to the next mustache or escaped mustache
            // \z (not \Z): the lookahead must only succeed at the true end of input.
            // \Z also succeeds before a trailing newline, which would split the final
            // CONTENT token in two for any template ending in "\n".
            new Rule('<emu>[^\\x00]{2,}?(?={{|\\\\{{|\\\\\\\\{{|\\z)', function () {
                $this->popState();
                return 'CONTENT';
            }),

            // nested raw block will create stacked 'raw' condition
            new Rule('<raw>{{{{(?=[^\\/])', function () {
                $this->pushState('raw');
                return 'CONTENT';
            }),

            new Rule('<raw>{{{{\\/' . $CTRL_INVERSE . '(?=[=}\\s\\/.])}}}}', function () {
                $this->popState();

                if ($this->topState() === 'raw') {
                    // closing a nested raw block: still plain content of the outer one
                    return 'CONTENT';
                } else {
                    // keep only the block name: drop leading "{{{{/" and trailing "}}}}"
                    $this->strip(5, 9);
                    return 'END_RAW_BLOCK';
                }
            }),
            new Rule('<raw>[^\\x00]+?(?={{{{)', fn() => 'CONTENT'),

            new Rule('<com>[\\s\\S]*?--' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'COMMENT';
            }),

            new Rule('<mu>\\(', fn() => 'OPEN_SEXPR'),
            new Rule('<mu>\\)', fn() => 'CLOSE_SEXPR'),

            new Rule('<mu>\\[', function () {
                // Assuming yy.syntax.square === 'string'. OPEN_ARRAY option not handled
                $this->unput($this->yytext);
                // escaped literal
                $this->pushState('escl');
                return null;
            }),
            new Rule('<mu>]', fn() => 'CLOSE_ARRAY'),

            new Rule('<mu>{{{{', fn() => 'OPEN_RAW_BLOCK'),
            new Rule('<mu>}}}}', function () {
                $this->popState();
                $this->pushState('raw');
                return 'CLOSE_RAW_BLOCK';
            }),
            new Rule('<mu>{{' . $LEFT_STRIP . '?>', fn() => 'OPEN_PARTIAL'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?#>', fn() => 'OPEN_PARTIAL_BLOCK'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?#\\*?', fn() => 'OPEN_BLOCK'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\/', fn() => 'OPEN_ENDBLOCK'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\^\\s*' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'INVERSE';
            }),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\s*else\\s*' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'INVERSE';
            }),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\^', fn() => 'OPEN_INVERSE'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\s*else', fn() => 'OPEN_INVERSE_CHAIN'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?{', fn() => 'OPEN_UNESCAPED'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?&', fn() => 'OPEN'),
            new Rule('<mu>{{' . $LEFT_STRIP . '?!--', function () {
                // push the opener back so the <com> rule captures the whole comment
                $this->unput($this->yytext);
                $this->popState();
                $this->pushState('com');
                return null;
            }),
            new Rule('<mu>{{' . $LEFT_STRIP . '?![\\s\\S]*?}}', function () {
                $this->popState();
                return 'COMMENT';
            }),
            new Rule('<mu>{{' . $LEFT_STRIP . '?\\*?', fn() => 'OPEN'),

            new Rule('<mu>=', fn() => 'EQUALS'),
            new Rule('<mu>\\.\\.', fn() => 'ID'),
            new Rule('<mu>\\.(?=' . $LOOKAHEAD . ')', fn() => 'ID'),
            new Rule('<mu>\\.#', fn() => 'PRIVATE_SEP'),
            new Rule('<mu>[\\/.]', fn() => 'SEP'),
            new Rule('<mu>\\s+', fn() => null), // ignore whitespace
            new Rule('<mu>}' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'CLOSE_UNESCAPED';
            }),
            new Rule('<mu>' . $RIGHT_STRIP . '?}}', function () {
                $this->popState();
                return 'CLOSE';
            }),
            // double-quoted string
            new Rule('<mu>"(\\\\["]|[^"])*"', function () {
                // drop the surrounding quotes, then unescape \" to "
                $this->strip(1, 2);
                $this->replace('/\\\\"/', '"');
                return 'STRING';
            }),
            // single quoted string
            new Rule("<mu>'(\\\\[']|[^'])*'", function () {
                // drop the surrounding quotes, then unescape \' to '
                $this->strip(1, 2);
                $this->replace("/\\\\'/", "'");
                return 'STRING';
            }),
            new Rule('<mu>@', fn() => 'DATA'),
            new Rule('<mu>true(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'BOOLEAN'),
            new Rule('<mu>false(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'BOOLEAN'),
            new Rule('<mu>undefined(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'UNDEFINED'),
            new Rule('<mu>null(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'NULL'),
            new Rule('<mu>\\-?[0-9]+(?:\\.[0-9]+)?(?=' . $LITERAL_LOOKAHEAD . ')', fn() => 'NUMBER'),
            new Rule('<mu>as\\s+\\|', fn() => 'OPEN_BLOCK_PARAMS'),
            new Rule('<mu>\\|', fn() => 'CLOSE_BLOCK_PARAMS'),

            new Rule('<mu>' . $ID, fn() => 'ID'),

            new Rule('<escl>\\[(\\\\\\]|[^\\]])*\\]', function () {
                // unescape \\ and \] inside the bracketed literal
                $this->replace('/\\\\([\\\\\\]])/', '$1');
                $this->popState();
                return 'ID';
            }),

            new Rule('<mu>.', fn() => 'INVALID'),

            // NOTE(review): the base lexer reports EOF itself once the input is empty,
            // so this rule appears to be kept only for parity with handlebars.l.
            new Rule('<INITIAL,mu>\\z', fn() => 'EOF'),
        ]);
    }

    /**
     * Trims the current match in place.
     *
     * Keeps `strlen($yytext) - $end` bytes starting at byte offset `$start`, so
     * `$end` is the total number of bytes removed (leading plus trailing).
     * For example strip(1, 2) removes one byte from each side.
     */
    private function strip(int $start, int $end): void
    {
        $this->yytext = substr($this->yytext, $start, strlen($this->yytext) - $end);
    }

    /**
     * Applies a regex replacement to the current match in place.
     *
     * @throws \Exception if preg_replace() reports an error
     */
    private function replace(string $pattern, string $replacement): void
    {
        $result = preg_replace($pattern, $replacement, $this->yytext);

        if ($result === null) {
            throw new \Exception('Failed to replace string: ' . preg_last_error_msg());
        }

        $this->yytext = $result;
    }
}

src/Phlexer/Phlexer.php

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * Small rule-driven lexer with a stack of start conditions (states).
 *
 * Subclasses supply an ordered list of rules. For each step, the first rule that
 * is enabled for the current top state and matches at the start of the remaining
 * input is applied, and its handler decides which token name (if any) to emit.
 */
abstract class Phlexer
{
    public const INITIAL_STATE = 'INITIAL';

    /**
     * Stack of active start conditions; the last element is the current state.
     *
     * @var string[]
     */
    private array $states = [self::INITIAL_STATE];

    /**
     * The input that has not been consumed yet.
     */
    private string $text = '';

    /**
     * The current matched value
     */
    protected string $yytext = '';

    /**
     * @param Rule[] $rules rules in priority order (earlier rules win)
     * @param string $eofToken name of the token that signals end of input
     */
    public function __construct(
        protected array $rules,
        protected string $eofToken = 'EOF',
    ) {}

    /**
     * Splits the given text into tokens. The end-of-input token is not included.
     *
     * @return Token[]
     * @throws \Exception if no rule matches the remaining input or a pattern fails to run
     */
    public function tokenize(string $text): array
    {
        // Start from a clean slate so an instance can be reused: a previous input may
        // have ended with states still pushed (e.g. an unterminated mustache).
        $this->text = $text;
        $this->yytext = '';
        $this->states = [self::INITIAL_STATE];
        $tokens = [];

        while (true) {
            $token = $this->getNextToken();

            if ($token === null) {
                // the matched rule produced no token (ignored input or a state change)
                continue;
            }

            if ($token->name === $this->eofToken) {
                break;
            }

            $tokens[] = $token;
        }

        return $tokens;
    }

    /**
     * Consumes the next match from the input.
     *
     * @return Token|null the produced token, or null when the matched rule emits nothing
     * @throws \Exception if no rule matches the remaining input or a pattern fails to run
     */
    protected function getNextToken(): ?Token
    {
        if ($this->text === '') {
            return new Token($this->eofToken, $this->text);
        }

        // Handlers only run after a match (and we return right after), so the state
        // cannot change while the rules are being scanned.
        $state = $this->topState();

        foreach ($this->rules as $rule) {
            if (!$rule->hasStartCondition($state)) {
                continue;
            }

            $result = preg_match("/\\A{$rule->pattern}/", $this->text, $matches);

            if ($result === false) {
                // e.g. backtrack/JIT limits: treating this as "no match" would silently
                // fall through to a later rule and produce wrong tokens
                throw new \Exception('Failed to match rule pattern: ' . preg_last_error_msg());
            }

            if ($result === 1) {
                $this->yytext = $matches[0];
                $this->text = substr($this->text, strlen($this->yytext));
                $tokenName = ($rule->handler)();

                if ($tokenName === null) {
                    return null; // e.g. to ignore whitespace or change state
                }

                return new Token($tokenName, $this->yytext);
            }
        }

        throw new \Exception("Unexpected token: \"{$this->text[0]}\"");
    }

    /**
     * Makes the given state the current one.
     */
    protected function pushState(string $state): void
    {
        $this->states[] = $state;
    }

    /**
     * Leaves the current state, returning to the one below it.
     */
    protected function popState(): void
    {
        array_pop($this->states);
    }

    /**
     * Returns the current state, or the initial state if the stack is empty.
     */
    protected function topState(): string
    {
        $lastKey = array_key_last($this->states);

        if ($lastKey === null) {
            return self::INITIAL_STATE;
        }

        return $this->states[$lastKey];
    }

    /**
     * Pushes characters back onto the front of the unconsumed input.
     */
    protected function unput(string $chars): void
    {
        $this->text = $chars . $this->text;
    }
}

src/Phlexer/Rule.php

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * A single lexer rule: a regex pattern, the start conditions (states) in which
 * it is enabled, and the handler to run when it matches.
 */
readonly class Rule
{
    /**
     * The regex source without delimiters and without any start-condition prefix.
     */
    public string $pattern;

    /**
     * States in which this rule may match.
     *
     * @var string[]
     */
    public array $startConditions;

    /**
     * @param string $pattern regex source, optionally prefixed with a comma-separated
     *                        list of start conditions in angle brackets, e.g. `<mu>=`
     *                        or `<INITIAL,mu>\z`. Without a prefix the rule is only
     *                        enabled in the initial state.
     * @param \Closure(): ?string $handler returns the token name, or null for no token
     */
    public function __construct(
        string $pattern,
        public \Closure $handler,
    ) {
        // If the pattern starts with <start,conditions>, move them to a separate array.
        // Names may be upper or lower case (e.g. INITIAL) and any number may be listed.
        $prefix = '/^<([A-Za-z_][A-Za-z0-9_]*(?:,[A-Za-z_][A-Za-z0-9_]*)*)>/';

        if (preg_match($prefix, $pattern, $matches) === 1) {
            $this->pattern = substr($pattern, strlen($matches[0]));
            $this->startConditions = explode(',', $matches[1]);
        } else {
            $this->pattern = $pattern;
            $this->startConditions = [Phlexer::INITIAL_STATE];
        }
    }

    /**
     * Tells whether this rule is enabled in the given state.
     */
    public function hasStartCondition(string $condition): bool
    {
        return in_array($condition, $this->startConditions, true);
    }
}

src/Phlexer/Token.php

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Phlexer;
4+
5+
/**
 * Immutable lexer output: a token name paired with the text it was built from.
 */
readonly class Token
{
    /**
     * The token's name, e.g. the value returned by a rule handler.
     */
    public string $name;

    /**
     * The text associated with the token.
     */
    public string $text;

    public function __construct(string $name, string $text)
    {
        $this->name = $name;
        $this->text = $text;
    }
}

test/LexerTest.php

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
<?php
2+
3+
namespace DevTheorem\Handlebars\Test;
4+
5+
use DevTheorem\Handlebars\Lexer;
6+
use DevTheorem\Handlebars\Phlexer\Token;
7+
use PHPUnit\Framework\Attributes\DataProvider;
8+
use PHPUnit\Framework\TestCase;
9+
10+
/**
 * Runs the Lexer against the tokenizer cases from the handlebars-spec package.
 */
class LexerTest extends TestCase
{
    /**
     * Loads the tokenizer spec and wraps each case as a single-argument data set.
     *
     * The path is relative to the current working directory.
     *
     * @return array<array{array<string, mixed>}>
     * @throws \RuntimeException if the spec file cannot be read or is not a JSON array
     * @throws \JsonException if the spec file is not valid JSON
     */
    public static function jsonSpecProvider(): array
    {
        $filename = 'vendor/jbboehr/handlebars-spec/spec/tokenizer.json';
        $contents = file_get_contents($filename);

        if ($contents === false) {
            throw new \RuntimeException("Unable to read tokenizer spec file: {$filename}");
        }

        $json = json_decode($contents, true, flags: JSON_THROW_ON_ERROR);

        if (!is_array($json)) {
            throw new \RuntimeException("Tokenizer spec file does not contain a JSON array: {$filename}");
        }

        return array_map(fn(array $d): array => [$d], $json);
    }

    /**
     * Asserts that the lexer emits exactly the tokens a spec case expects.
     *
     * @param array<string, mixed> $spec one spec case; reads 'it', 'template' and 'expected'
     */
    #[DataProvider("jsonSpecProvider")]
    public function testSpecs(array $spec): void
    {
        // fix invalid expectations
        if ($spec['it'] === 'does not time out in a mustache with a single } followed by EOF') {
            $spec['expected'][] = ['name' => 'INVALID', 'text' => '}'];
        } elseif ($spec['it'] === 'does not time out in a mustache when invalid ID characters are used') {
            $spec['expected'][] = ['name' => 'INVALID', 'text' => '&'];
            $spec['expected'][] = ['name' => 'CLOSE', 'text' => '}}'];
        }

        $lexer = new Lexer();
        $toJson = fn(Token $t) => ['name' => $t->name, 'text' => $t->text];
        $actual = array_map($toJson, $lexer->tokenize($spec['template']));
        self::assertSame($spec['expected'], $actual);
    }
}

0 commit comments

Comments
 (0)