diff --git a/embed.fnc b/embed.fnc index 3a16e2ffbe43..e15a894a60b3 100644 --- a/embed.fnc +++ b/embed.fnc @@ -6194,7 +6194,7 @@ RS |char * |scan_heredoc |NN char *s S |char * |scan_ident |NN char *s \ |SPTR char *dest \ |EPTR char *dest_end \ - |bool chk_unary + |U32 flags RS |char * |scan_inputsymbol \ |NN char *start RS |char * |scan_pat |NN char *start \ diff --git a/proto.h b/proto.h index c2908b823fc1..80b0c279021d 100644 --- a/proto.h +++ b/proto.h @@ -9500,7 +9500,7 @@ S_scan_heredoc(pTHX_ char *s) assert(s) STATIC char * -S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary); +S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags); # define PERL_ARGS_ASSERT_SCAN_IDENT \ assert(s); assert(dest); assert(dest_end); assert(dest < dest_end) diff --git a/regen/unicode_constants.pl b/regen/unicode_constants.pl index 538a397edce5..8afc21703464 100644 --- a/regen/unicode_constants.pl +++ b/regen/unicode_constants.pl @@ -1039,6 +1039,7 @@ END U+10FFFF string MAX_UNICODE +SHY native NBSP native NBSP string diff --git a/t/comp/parser.t b/t/comp/parser.t index 44d8fef2ced1..658811374f65 100644 --- a/t/comp/parser.t +++ b/t/comp/parser.t @@ -8,7 +8,7 @@ BEGIN { chdir 't' if -d 't'; } -print "1..192\n"; +print "1..193\n"; sub failed { my ($got, $expected, $name) = @_; @@ -673,6 +673,18 @@ is $@, "", 'substr keys assignment'; is ($@, "", "Handles all numeric package component after ::"); } +{ + my $expected = "this is the way the identifier ends; not with a bang"; + my $result; + eval "use utf8; my \$e\x{1df8}claire = '$expected'; \$result = \${e\x{1df8}claire}"; + if ($@) { + failed($@, "no error", "Didn't crash"); + } + else { + is ($result, $expected, "Parser can handle a continuation as 2nd char"); + } +} + # Add new tests HERE (above this line) # bug #74022: Loop on characters in \p{OtherIDContinue} diff --git a/toke.c b/toke.c index f9a20bdd9250..aef854a9e641 100644 --- a/toke.c +++ b/toke.c @@ -180,6 +180,8 @@ static const char ident_var_zero_multi_digit[] = "Numeric variables with more th #define IDFIRST_ONLY (1 << 3) #define STOP_AT_FIRST_NON_DIGIT (1 << 4) #define CHECK_ONLY (1 << 5) +#define CHECK_UNARY (1 << 6) +#define IDCONT_first_OK (1 << 7) #ifdef DEBUGGING static const char* const lex_state_names[] = { @@ -4741,12 +4743,13 @@ S_intuit_more(pTHX_ char *s, char *e, * changed since the code was first added */ char tmpbuf[ C_ARRAY_LENGTH(PL_tokenbuf) * 4 ]; - /* khw: scan_ident shouldn't be used as-is. It has side - * effects and can end up calling this function recursively. - * - * khw: If what follows can't be an identifier, say it is too - * long or is $001, then it must be a charclass */ - scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), FALSE); + if (! scan_ident(s, tmpbuf, C_ARRAY_END(tmpbuf), CHECK_ONLY)) + { + /* An illegal identifier means this can't be a subscript; + * it's an error or it could be a charclass */ + return false; + } + len = strlen(tmpbuf); /* khw: This only looks at global variables; lexicals came @@ -5606,8 +5609,7 @@ yyl_dollar(pTHX_ char *s) || memCHRs("{$:+-@", s[2]))) { PL_tokenbuf[0] = '@'; - s = scan_ident(s + 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), - FALSE); + s = scan_ident(s + 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Array length", s, POP_OLDBUFPTR); if (!PL_tokenbuf[1]) PREREF(DOLSHARP); @@ -5617,7 +5619,7 @@ yyl_dollar(pTHX_ char *s) } PL_tokenbuf[0] = '$'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Scalar", s, POP_OLDBUFPTR); if (!PL_tokenbuf[1]) { if (s == PL_bufend) @@ -6282,7 +6284,7 @@ yyl_star(pTHX_ char *s) POSTDEREF(PERLY_STAR); if (PL_expect != XOPERATOR) { - s = scan_ident(s, PL_tokenbuf, C_ARRAY_END(PL_tokenbuf), TRUE); + s = scan_ident(s, PL_tokenbuf, C_ARRAY_END(PL_tokenbuf), CHECK_UNARY); PL_expect = XOPERATOR; force_ident(PL_tokenbuf, PERLY_STAR); if (!*PL_tokenbuf) @@ -6330,7 +6332,7 @@ yyl_percent(pTHX_ char *s) POSTDEREF(PERLY_PERCENT_SIGN); PL_tokenbuf[0] = '%'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); pl_yylval.ival = 0; if (!PL_tokenbuf[1]) { PREREF(PERLY_PERCENT_SIGN); @@ -6867,7 +6869,8 @@ yyl_ampersand(pTHX_ char *s) } PL_tokenbuf[0] = '&'; - s = scan_ident(s - 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), TRUE); + s = scan_ident(s - 1, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), + CHECK_UNARY); pl_yylval.ival = (OPpENTERSUB_AMPER<<8); if (PL_tokenbuf[1]) @@ -6952,7 +6955,7 @@ yyl_snail(pTHX_ char *s) if (PL_expect == XPOSTDEREF) POSTDEREF(PERLY_SNAIL); PL_tokenbuf[0] = '@'; - s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), FALSE); + s = scan_ident(s, PL_tokenbuf + 1, C_ARRAY_END(PL_tokenbuf), 0); S_warn_expect_operator(aTHX_ "Array", s, POP_OLDBUFPTR); pl_yylval.ival = 0; if (!PL_tokenbuf[1]) { @@ -10596,6 +10599,11 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end, * in things like Foo::$bar */ const bool check_dollar = flags & CHECK_DOLLAR; + /* There is a use case for calling this function in the middle of having + * parsed a portion of an identifier. Therefore it should be able to + * accept the first character being an IDCont, and not necessarily an + * IDFIRST. The 'IDCONT_first_OK' flag is used to indicate this */ + while (s < s_end) { /* For non-UTF8, variables that match ASCII \w are a superset of @@ -10603,8 +10611,11 @@ S_parse_ident(pTHX_ const char *s, const char * const s_end, * Unicode definition only when UTF-8 is in effect. We have to check * for the subset before checking for the superset. */ Size_t advance; - if ( (advance = isIDFIRST_lazy_if_safe(s, s_end, is_utf8)) - && (is_utf8 || idfirst_only)) + if ( (is_utf8 || idfirst_only) + && (advance = (flags & IDCONT_first_OK) + ? isIDCONT_lazy_if_safe((U8 *) s, (U8 *) s_end, + is_utf8) + : isIDFIRST_lazy_if_safe(s, s_end, is_utf8))) { const char *this_start = s; s += advance; @@ -10749,43 +10760,71 @@ Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STR return s; } -/* scan s and extract an identifier ($var) from it if possible - * into dest. - * XXX: This function has subtle implications on parsing, and - * changing how it behaves can cause a variable to change from - * being a run time rv2sv call or a compile time binding to a - * specific variable name. +/* scan 's' and extract an identifier ($var) from it into 'dest' if possible. + * + * Unlike S_parse_ident which looks for the more usual types of identifiers + * (and which this calls if needed), this looks for every possible identifier + * type, such as punctuation ones. + * + * It returns a pointer into the input buffer pointing to just after all the + * bytes this function consumed; or croaks if an invalid identifier is found. + * + * XXX: This function normally has subtle implications on parsing, and + * changing how it behaves can cause a variable to change from being a run + * time rv2sv call or a compile time binding to a specific variable name. + * + * However, it can be called with the CHECK_ONLY flag which keeps it from + * making any changes besides populating the memory 'dest' points to. If the + * identifier is illegal, it returns NULL instead of croaking. + * + * And use the CHECK_UNARY flag to cause this to look for ambiguities with + * unary operators. This is silently overriden if CHECK_ONLY is also + * specified. */ STATIC char * -S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) +S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, U32 flags) { PERL_ARGS_ASSERT_SCAN_IDENT; I32 herelines = PL_parser->herelines; - SSize_t bracket = -1; + +#define NO_BRACE -1 + SSize_t bracket = NO_BRACE; + char funny = *s++; char *d = dest; char * const e = dest_end - 3; /* two-character token, ending NUL */ bool is_utf8 = cBOOL(UTF); line_t orig_copline = 0, tmp_copline = 0; + /* Leave the flag in its position, so can pass this on without needing to + * anything extra */ + const U32 check_only = flags & CHECK_ONLY; + + const bool chk_unary = ! check_only && (flags & CHECK_UNARY); + if (isSPACE(*s) || !*s) s = skipspace(s); /* See if it is a "normal" identifier */ s = parse_ident(s, PL_bufend, &d, e, is_utf8, - (ALLOW_PACKAGE | STOP_AT_FIRST_NON_DIGIT)); - d = dest; + (ALLOW_PACKAGE | STOP_AT_FIRST_NON_DIGIT | check_only)); + if (s == NULL) { + return NULL; + } + d = dest; if (*d) { + /* Here parse_ident() found a digit variable or an identifier (anything valid as a bareword), so job done and return. */ - if (PL_lex_state != LEX_NORMAL) + if (! check_only && PL_lex_state != LEX_NORMAL) PL_lex_state = LEX_INTERPENDMAYBE; return s; } - /* Here, it is not a run-of-the-mill identifier name */ + /* Here, it is not a run-of-the-mill identifier name; maybe not an + * identifier at all. Note *d is a NUL */ if (*s == '$' && s[1] && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8) @@ -10800,9 +10839,12 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) return s; } - /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */ + /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} + * Skip to the first non-space past the brace */ if (*s == '{') { + /* 'bracket' becomes the offset from the beginning of this chunk */ bracket = s - SvPVX(PL_linestr); + s++; orig_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { @@ -10810,11 +10852,12 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) } } - /* Extract the first character of the variable name from 's' and - * copy it, null terminated into 'd'. Note that this does not - * involve checking for just IDFIRST characters, as it allows the - * '^' for ${^FOO} type variable names, and it allows all the - * characters that are legal in a single character variable name. + /* Here, 's' points to the next "interesting" character. + * Extract the first character of the potential variable name from 's' and + * copy it, NUL terminated, into 'd'. Note that this does not involve + * checking for just IDFIRST characters, as it allows the '^' for ${^FOO} + * type variable names, and it allows all the characters that are legal in + * a single character variable name. * * The legal ones are any of: * a) all ASCII characters except: @@ -10829,82 +10872,126 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) * Because all ASCII characters have the same representation whether * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and * '{' without knowing if is UTF-8 or not. */ - - if ( (s <= PL_bufend - ((is_utf8) ? UTF8SKIP(s) : 1)) + STRLEN advance = 1; + if ( s < PL_bufend && ( isGRAPH_A(*s) - || (is_utf8 - ? isIDFIRST_utf8_safe(s, PL_bufend) - : ( isGRAPH_L1(*s) - && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD)))))) + || (is_utf8 ? (advance = isIDFIRST_utf8_safe(s, PL_bufend)) + : (isGRAPH_L1(*s) && LIKELY((U8) *s != SHY_NATIVE))))) { - if (is_utf8) { - const STRLEN skip = UTF8SKIP(s); - STRLEN i; - d[skip] = '\0'; - for ( i = 0; i < skip; i++ ) - d[i] = *s++; - } - else { - *d = *s++; - d[1] = '\0'; - } + STRLEN i; + d[advance] = '\0'; + for ( i = 0; i < advance; i++ ) + d[i] = *s++; } - /* special case to handle ${10}, ${11} the same way we handle $1 etc */ + /* 'd' has not been advanced, but if 's' pointed to a legal identifier + * character, it has been advanced to the next character, and the + * character it previously pointed to has been copied to where 'd' + * continues to point to. + * + * If that copied character is a digit, it means we have something like + * ${10}, ${1547}, etc. Handle those the same way we handle $1, etc */ if (isDIGIT(*d)) { + assert(bracket != NO_BRACE); s = parse_ident(s - 1, PL_bufend, &d, e, is_utf8, - STOP_AT_FIRST_NON_DIGIT); + STOP_AT_FIRST_NON_DIGIT | check_only); + if (s == NULL) { + return NULL; + } /* The code below is expecting d to point to the final digit */ d--; } - - /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */ - else if (*d == '^' && *s && isCONTROLVAR(*s)) { + else /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */ + if (*d == '^' && *s && isCONTROLVAR(*s)) { *d = toCTRL(*s); s++; } - /* Warn about ambiguous code after unary operators if {...} notation isn't - used. There's no difference in ambiguity; it's merely a heuristic - about when not to warn. */ - else if (chk_unary && bracket == -1) + else /* Warn about ambiguous code after unary operators if {...} notation + isn't used. There's no difference in ambiguity; it's merely a + heuristic about when not to warn. */ + if (chk_unary && bracket == -1) { check_unary(); + } - if (bracket != -1) { - bool skip; - char *s2; - /* If we were processing {...} notation then... */ - if ( isIDFIRST_lazy_if_safe(d, e, is_utf8) - || ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ - && isWORDCHAR(*s)) - ) { - /* note we have to check for a normal identifier first, - * as it handles utf8 symbols, and only after that has - * been ruled out can we look at the caret words */ - Size_t advance; - if ((advance = isIDFIRST_lazy_if_safe(d, e, is_utf8) )) { - /* if it starts as a valid identifier, assume that it is one. - (the later check for } being at the expected point will trap - cases where this doesn't pan out.) */ + /* Here, 's' points to the next "interesting" character to be parsed. And + * *d points to the first byte of the final so-far parsed and copied + * character. This is one of four things: + * 1) The only byte of the final character of an all-digit numeric + * variable inside braces. e.g. if the input is ${ 123 }, '123' has + * been copied to 'dest', and 'd' points to the '3'. We don't know + * yet if there is a closing brace. + * 2) A control character + * 3) The first (or only) byte of some other identifier + * 4) *d is NUL for anything else. + */ + + if (bracket == NO_BRACE) { + if ( ! check_only + && PL_lex_state == LEX_INTERPNORMAL + && ! PL_lex_brackets + && ! intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) + PL_lex_state = LEX_INTERPEND; + } + else { /* Found a '{' */ + + /* Handle the interior of braces. First look to see if the character + * pointed to by 'd' is legal as the start of an identifier. */ + Size_t advance = isIDFIRST_lazy_if_safe(d, e, is_utf8); + + /* If it isn't a normal identifier, it could be a control-character + * one. Those have to be followed by a \w character. */ + if (advance || ( ! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ + && isWORDCHAR(*s))) + { + /* Prefer a normal identifier, as UTF-8 strings could erroneously + * be conflated with a control character identifier. */ + if (advance) { + + /* Now parse the normal identifier. But note, we already have + * parsed and copied the first character of it. That means we + * are jumping into the middle; so tell that to parse_ident. + * */ d += advance; s = parse_ident(s, PL_bufend, &d, e, is_utf8, - (ALLOW_PACKAGE | CHECK_DOLLAR)); + ( ALLOW_PACKAGE + | CHECK_DOLLAR + | IDCONT_first_OK + | check_only)); } else { /* caret word: ${^Foo} ${^CAPTURE[0]} */ + + /* Now parse the control character identifier. Again, we have + * already copied the first character. This routine is + * sufficiently chummy with parse_ident to know that when we + * say the string isn't UTF-8, it will do the right thing in + * looking only for ASCII \w characters as identifier + * continuations */ d++; - while (isWORDCHAR(*s) && d < e) { - *d++ = *s++; - } - if (d >= e) - croak("%s", ident_too_long); - *d = '\0'; + s = parse_ident(s, PL_bufend, &d, e, + false, /* Don't allow UTF-8 */ + IDCONT_first_OK); } + + if (s == NULL) { /* Can't be NULL unless is check_only */ + return NULL; + } + tmp_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { s = skipspace(s); } - if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) { + + if (*s == '[' || (*s == '{' && strNE(dest, "sub"))) { + + /* In this branch, 's' is not changed further. If only + * checking validity, return now before any state changes */ + if (check_only) { + return s; + } + /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */ + if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) { const char * const brack = (const char *) @@ -10926,6 +11013,9 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) if ( !tmp_copline ) tmp_copline = CopLINE(PL_curcop); + + char *s2; + bool skip; if ((skip = s < PL_bufend && isSPACE(*s))) { /* Avoid incrementing line numbers or resetting PL_linestart, in case we have to back up. */ @@ -10936,13 +11026,35 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) else s2 = s; - /* Expect to find a closing } after consuming any trailing whitespace. - */ - if (*s2 == '}') { - /* Now increment line numbers if applicable. */ + /* Expect to find a closing '}' after consuming any trailing + * whitespace. */ + if (*s2 != '}') { + /* Didn't find the closing '}' at the point we expected, so + * restore the state such that the next thing to process is the + * opening '{' and let the parser handle it */ + s = SvPVX(PL_linestr) + bracket; + + /* The final change to 's' has just been made. If only validity + * checking, return before making any state changes */ + if (check_only) { + return s; + } + CopLINE_set(PL_curcop, orig_copline); + PL_parser->herelines = herelines; + *dest = '\0'; + PL_parser->sub_no_recover = TRUE; + } + else { /* Now increment line numbers if applicable. */ if (skip) s = skipspace(s); s++; + + /* The final change to 's' has just been made. If only validity + * checking, return before making any state changes */ + if (check_only) { + return s; + } + if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) { PL_lex_state = LEX_INTERPEND; PL_expect = XREF; @@ -10967,20 +11079,8 @@ S_scan_ident(pTHX_ char *s, char *dest, char *dest_end, bool chk_unary) } } } - else { - /* Didn't find the closing } at the point we expected, so restore - state such that the next thing to process is the opening { and */ - s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */ - CopLINE_set(PL_curcop, orig_copline); - PL_parser->herelines = herelines; - *dest = '\0'; - PL_parser->sub_no_recover = TRUE; - } } - else if ( PL_lex_state == LEX_INTERPNORMAL - && !PL_lex_brackets - && !intuit_more(s, PL_bufend, FROM_IDENT, NULL, 0)) - PL_lex_state = LEX_INTERPEND; + return s; } diff --git a/unicode_constants.h b/unicode_constants.h index 78b401683265..574678d9e10f 100644 --- a/unicode_constants.h +++ b/unicode_constants.h @@ -76,6 +76,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xF4\x8F\xBF\xBF" /* U+10FFFF */ +# define SHY_NATIVE 0xAD /* U+00AD */ # define NBSP_NATIVE 0xA0 /* U+00A0 */ # define NBSP_UTF8 "\xC2\xA0" /* U+00A0 */ @@ -142,6 +143,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xEE\x42\x73\x73\x73" /* U+10FFFF */ +# define SHY_NATIVE 0xCA /* U+00AD */ # define NBSP_NATIVE 0x41 /* U+00A0 */ # define NBSP_UTF8 "\x80\x41" /* U+00A0 */ @@ -208,6 +210,7 @@ bytes. # define MAX_UNICODE_UTF8 "\xEE\x42\x72\x72\x72" /* U+10FFFF */ +# define SHY_NATIVE 0xCA /* U+00AD */ # define NBSP_NATIVE 0x41 /* U+00A0 */ # define NBSP_UTF8 "\x78\x41" /* U+00A0 */