From c93fc44fa176bd8936bff21473e3fff602200f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Sat, 31 Aug 2024 17:15:04 +0800 Subject: [PATCH 1/8] allow Chinese variable --- parse/lexer.go | 63 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 13 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index c20a0bdd..aef7944f 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -8,6 +8,8 @@ import ( "reflect" "strconv" "strings" + "unicode" + "unicode/utf8" "github.com/yuin/gopher-lua/ast" ) @@ -31,7 +33,8 @@ func (e *Error) Error() string { } } -func writeChar(buf *bytes.Buffer, c int) { buf.WriteByte(byte(c)) } +func writeChar(buf *bytes.Buffer, c int) { buf.WriteByte(byte(c)) } +func writeRune(buf *bytes.Buffer, c rune) { buf.WriteRune(c) } func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' } @@ -39,6 +42,11 @@ func isIdent(ch int, pos int) bool { return ch == '_' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z' || isDecimal(ch) && pos > 0 } +// isChinese +func isChinese(ch rune) bool { + return unicode.Is(unicode.Han, ch) +} + func isDigit(ch int) bool { return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F' } @@ -64,7 +72,7 @@ func (sc *Scanner) Error(tok string, msg string) *Error { return &Error{sc.Pos, func (sc *Scanner) TokenError(tok ast.Token, msg string) *Error { return &Error{tok.Pos, msg, tok.Str} } func (sc *Scanner) readNext() int { - ch, err := sc.reader.ReadByte() + ch, _, err := sc.reader.ReadRune() if err == io.EOF { return EOF } @@ -79,21 +87,25 @@ func (sc *Scanner) Newline(ch int) { sc.Pos.Column = 0 next := sc.Peek() if ch == '\n' && next == '\r' || ch == '\r' && next == '\n' { - sc.reader.ReadByte() + sc.reader.ReadRune() } } func (sc *Scanner) Next() int { ch := sc.readNext() - switch ch { - case '\n', '\r': - sc.Newline(ch) - ch = int('\n') - case EOF: - sc.Pos.Line = EOF - sc.Pos.Column = 0 - default: - sc.Pos.Column++ + if isChinese(rune(ch)) { + sc.Pos.Column += utf8.RuneLen(rune(ch)) + } else { + switch ch { + case '\n', '\r': + sc.Newline(ch) + ch = int('\n') + case EOF: + sc.Pos.Line = EOF + sc.Pos.Column = 0 + default: + sc.Pos.Column++ + } } return ch } @@ -101,7 +113,7 @@ func (sc *Scanner) Next() int { func (sc *Scanner) Peek() int { ch := sc.readNext() if ch != EOF { - sc.reader.UnreadByte() + sc.reader.UnreadRune() } return ch } @@ -142,6 +154,21 @@ func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { return nil } +// scanChinese +func (sc *Scanner) scanChinese(ch int, buf *bytes.Buffer) error { + writeRune(buf, rune(ch)) + for isChinese(rune(sc.Peek())) || isIdent(sc.Peek(), 1) { + if isChinese(rune(sc.Peek())) { + writeRune(buf, rune(sc.Next())) + } else { + if isIdent(sc.Peek(), 1) { + writeChar(buf, sc.Next()) + } + } + } + return nil +} + func (sc *Scanner) scanDecimal(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) for isDecimal(sc.Peek()) { @@ -312,6 +339,16 @@ redo: tok.Pos = sc.Pos switch { + case isChinese(rune(ch)): + tok.Type = TIdent + err = sc.scanChinese(ch, buf) + tok.Str = buf.String() + if err != nil { + goto finally + } + if typ, ok := reservedWords[tok.Str]; ok { + tok.Type = typ + } case isIdent(ch, 0): tok.Type = TIdent err = sc.scanIdent(ch, buf) From f50bd097ebd6abfbdd43cda41a56a15d3cd395c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Sat, 31 Aug 2024 18:15:34 +0800 Subject: [PATCH 2/8] allow mixed variable --- parse/lexer.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index aef7944f..d191af50 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -148,8 +148,14 @@ func (sc *Scanner) skipComments(ch int) error { func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) - for isIdent(sc.Peek(), 1) { - writeChar(buf, sc.Next()) + for isChinese(rune(sc.Peek())) || isIdent(sc.Peek(), 1) { + if isChinese(rune(sc.Peek())) { + writeRune(buf, rune(sc.Next())) + } else { + if isIdent(sc.Peek(), 1) { + writeChar(buf, sc.Next()) + } + } } return nil } From 00b96182d476900b125f74411a50e905367cfefa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Sat, 31 Aug 2024 18:23:10 +0800 Subject: [PATCH 3/8] add scanIdentAndChinese --- parse/lexer.go | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index d191af50..e21db3dc 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -148,6 +148,12 @@ func (sc *Scanner) skipComments(ch int) error { func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) + sc.scanIdentAndChinese(buf) + return nil +} + +// scanIdentAndChinese +func (sc *Scanner) scanIdentAndChinese(buf *bytes.Buffer) error { for isChinese(rune(sc.Peek())) || isIdent(sc.Peek(), 1) { if isChinese(rune(sc.Peek())) { writeRune(buf, rune(sc.Next())) @@ -163,15 +169,7 @@ func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { // scanChinese func (sc *Scanner) scanChinese(ch int, buf *bytes.Buffer) error { writeRune(buf, rune(ch)) - for isChinese(rune(sc.Peek())) || isIdent(sc.Peek(), 1) { - if isChinese(rune(sc.Peek())) { - writeRune(buf, rune(sc.Next())) - } else { - if isIdent(sc.Peek(), 1) { - writeChar(buf, sc.Next()) - } - } - } + sc.scanIdentAndChinese(buf) return nil } From 119574e6c5c720ad34b5b06e85bfbf5dc096130d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Mon, 2 Sep 2024 10:50:05 +0800 Subject: [PATCH 4/8] allow chinese function names --- parse/lexer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index e21db3dc..0054dcfa 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -225,7 +225,7 @@ func (sc *Scanner) scanString(quote int, buf *bytes.Buffer) error { return err } } else { - writeChar(buf, ch) + writeRune(buf, rune(ch)) } ch = sc.Next() } @@ -305,7 +305,7 @@ func (sc *Scanner) scanMultilineString(ch int, buf *bytes.Buffer) error { buf.WriteString(strings.Repeat("=", count2)) continue } - writeChar(buf, ch) + writeRune(buf, rune(ch)) ch = sc.Next() } From 783372a549eb0f6a7c63caad8ba1c9c373b357e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Mon, 2 Sep 2024 11:23:48 +0800 Subject: [PATCH 5/8] Another way to support Chinese --- parse/lexer.go | 71 +++++++++++++++----------------------------------- 1 file changed, 21 insertions(+), 50 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index 0054dcfa..ff137512 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -9,7 +9,6 @@ import ( "strconv" "strings" "unicode" - "unicode/utf8" "github.com/yuin/gopher-lua/ast" ) @@ -33,12 +32,16 @@ func (e *Error) Error() string { } } -func writeChar(buf *bytes.Buffer, c int) { buf.WriteByte(byte(c)) } -func writeRune(buf *bytes.Buffer, c rune) { buf.WriteRune(c) } +func writeChar(buf *bytes.Buffer, c int) { + buf.WriteRune(rune(c)) +} func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' } func isIdent(ch int, pos int) bool { + if isChinese(rune(ch)) { + return true + } return ch == '_' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z' || isDecimal(ch) && pos > 0 } @@ -87,25 +90,21 @@ func (sc *Scanner) Newline(ch int) { sc.Pos.Column = 0 next := sc.Peek() if ch == '\n' && next == '\r' || ch == '\r' && next == '\n' { - sc.reader.ReadRune() + sc.reader.ReadByte() } } func (sc *Scanner) Next() int { ch := sc.readNext() - if isChinese(rune(ch)) { - sc.Pos.Column += utf8.RuneLen(rune(ch)) - } else { - switch ch { - case '\n', '\r': - sc.Newline(ch) - ch = int('\n') - case EOF: - sc.Pos.Line = EOF - sc.Pos.Column = 0 - default: - sc.Pos.Column++ - } + switch ch { + case '\n', '\r': + sc.Newline(ch) + ch = int('\n') + case EOF: + sc.Pos.Line = EOF + sc.Pos.Column = 0 + default: + sc.Pos.Column++ } return ch } @@ -148,31 +147,13 @@ func (sc *Scanner) skipComments(ch int) error { func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) - sc.scanIdentAndChinese(buf) - return nil -} - -// scanIdentAndChinese -func (sc *Scanner) scanIdentAndChinese(buf *bytes.Buffer) error { - for isChinese(rune(sc.Peek())) || isIdent(sc.Peek(), 1) { - if isChinese(rune(sc.Peek())) { - writeRune(buf, rune(sc.Next())) - } else { - if isIdent(sc.Peek(), 1) { - writeChar(buf, sc.Next()) - } - } + fmt.Println(isIdent(sc.Peek(), 1)) + for isIdent(sc.Peek(), 1) { + writeChar(buf, sc.Next()) } return nil } -// scanChinese -func (sc *Scanner) scanChinese(ch int, buf *bytes.Buffer) error { - writeRune(buf, rune(ch)) - sc.scanIdentAndChinese(buf) - return nil -} - func (sc *Scanner) scanDecimal(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) for isDecimal(sc.Peek()) { @@ -225,7 +206,7 @@ func (sc *Scanner) scanString(quote int, buf *bytes.Buffer) error { return err } } else { - writeRune(buf, rune(ch)) + writeChar(buf, ch) } ch = sc.Next() } @@ -305,7 +286,7 @@ func (sc *Scanner) scanMultilineString(ch int, buf *bytes.Buffer) error { buf.WriteString(strings.Repeat("=", count2)) continue } - writeRune(buf, rune(ch)) + writeChar(buf, ch) ch = sc.Next() } @@ -343,16 +324,6 @@ redo: tok.Pos = sc.Pos switch { - case isChinese(rune(ch)): - tok.Type = TIdent - err = sc.scanChinese(ch, buf) - tok.Str = buf.String() - if err != nil { - goto finally - } - if typ, ok := reservedWords[tok.Str]; ok { - tok.Type = typ - } case isIdent(ch, 0): tok.Type = TIdent err = sc.scanIdent(ch, buf) From 0b6ecafa760e3c212f13b8be184e164eaf6b673a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Mon, 2 Sep 2024 11:43:37 +0800 Subject: [PATCH 6/8] join --- parse/lexer.go | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index ff137512..02339345 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -39,10 +39,7 @@ func writeChar(buf *bytes.Buffer, c int) { func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' } func isIdent(ch int, pos int) bool { - if isChinese(rune(ch)) { - return true - } - return ch == '_' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z' || isDecimal(ch) && pos > 0 + return isChinese(rune(ch)) || ch == '_' || 'A' <= ch && ch <= 'Z' || 'a' <= ch && ch <= 'z' || isDecimal(ch) && pos > 0 } // isChinese From da185be70d35deb802dfbf857a7bf106f5ff53a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Mon, 2 Sep 2024 19:53:12 +0800 Subject: [PATCH 7/8] remove print log --- parse/lexer.go | 1 - 1 file changed, 1 deletion(-) diff --git a/parse/lexer.go b/parse/lexer.go index 02339345..a0a59c12 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -144,7 +144,6 @@ func (sc *Scanner) skipComments(ch int) error { func (sc *Scanner) scanIdent(ch int, buf *bytes.Buffer) error { writeChar(buf, ch) - fmt.Println(isIdent(sc.Peek(), 1)) for isIdent(sc.Peek(), 1) { writeChar(buf, sc.Next()) } From 59f0a4d4a62cff2e7b02532a1e058370cbdc8116 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=A8=E6=B4=81?= Date: Mon, 2 Sep 2024 19:55:31 +0800 Subject: [PATCH 8/8] inline --- parse/lexer.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/parse/lexer.go b/parse/lexer.go index a0a59c12..38658f76 100644 --- a/parse/lexer.go +++ b/parse/lexer.go @@ -32,9 +32,7 @@ func (e *Error) Error() string { } } -func writeChar(buf *bytes.Buffer, c int) { - buf.WriteRune(rune(c)) -} +func writeChar(buf *bytes.Buffer, c int) { buf.WriteRune(rune(c)) } func isDecimal(ch int) bool { return '0' <= ch && ch <= '9' } @@ -43,9 +41,7 @@ func isIdent(ch int, pos int) bool { } // isChinese -func isChinese(ch rune) bool { - return unicode.Is(unicode.Han, ch) -} +func isChinese(ch rune) bool { return unicode.Is(unicode.Han, ch) } func isDigit(ch int) bool { return '0' <= ch && ch <= '9' || 'a' <= ch && ch <= 'f' || 'A' <= ch && ch <= 'F'