Source code

Revision control

Copy as Markdown

Other Tools

/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
// A CSS Lexer. This file is a bit unusual -- it is a more or less
// direct translation of layout/style/nsCSSScanner.cpp and
// layout/style/CSSLexer.cpp into JS. This implements the
// CSSLexer.webidl interface, and the intent is to try to keep it in
// sync with changes to the platform CSS lexer. Due to this goal,
// this file violates some naming conventions and consequently locally
// disables some eslint rules.
/* eslint-disable camelcase, mozilla/no-aArgs, no-else-return, complexity */
"use strict";
// Token type constants. Each value is the string reported as |tokenType|
// on the objects returned by Scanner.prototype.nextToken. The "m..." names
// in the comments below are fields of the internal token object.
//
// White space of any kind. No value fields are used. Note that
// comments do *not* count as white space; comments separate tokens
// but are not themselves tokens.
const eCSSToken_Whitespace = "whitespace"; // (space, tab, newline, ...)
// A comment.
const eCSSToken_Comment = "comment"; // /*...*/
// Identifier-like tokens. mIdent is the text of the identifier.
// The difference between ID and Hash is: if the text after the #
// would have been a valid Ident if the # hadn't been there, the
// scanner produces an ID token. Otherwise it produces a Hash token.
// (This distinction is required by css3-selectors.)
const eCSSToken_Ident = "ident"; // word
const eCSSToken_Function = "function"; // word(
const eCSSToken_AtKeyword = "at"; // @word
const eCSSToken_ID = "id"; // #word
const eCSSToken_Hash = "hash"; // #0word
// Numeric tokens. mNumber is the floating-point value of the
// number, and mHasSign indicates whether there was an explicit sign
// (+ or -) in front of the number. If mIntegerValid is true, the
// number had the lexical form of an integer, and mInteger is its
// integer value. Lexically integer values outside the range of a
// 32-bit signed number are clamped to the maximum values; mNumber
// will indicate a 'truer' value in that case. (Note: ScanNumber in
// this JS port clamps mInteger to Number.MAX_SAFE_INTEGER /
// Number.MIN_SAFE_INTEGER rather than to the 32-bit range.) Percentage
// tokens are always considered not to be integers, even if their numeric
// value is integral (100% => mNumber = 1.0). For Dimension
// tokens, mIdent holds the text of the unit.
const eCSSToken_Number = "number"; // 1 -5 +2e3 3.14159 7.297352e-3
const eCSSToken_Dimension = "dimension"; // 24px 8.5in
const eCSSToken_Percentage = "percentage"; // 85% 1280.4%
// String-like tokens. In all cases, mIdent holds the text
// belonging to the string, and mSymbol holds the delimiter
// character, which may be ', ", or zero (only for unquoted URLs).
// Bad_String and Bad_URL tokens are emitted when the closing
// delimiter or parenthesis was missing.
const eCSSToken_String = "string"; // 'foo bar' "foo bar"
const eCSSToken_Bad_String = "bad_string"; // 'foo bar
const eCSSToken_URL = "url"; // url(foobar) url("foo bar")
const eCSSToken_Bad_URL = "bad_url"; // url(foo
// Any one-character symbol. mSymbol holds the character.
const eCSSToken_Symbol = "symbol"; // . ; { } ! *
// Match operators. These are single tokens rather than pairs of
// Symbol tokens because css3-selectors forbids the presence of
// comments between the two characters. No value fields are used;
// the token type indicates which operator.
const eCSSToken_Includes = "includes"; // ~=
const eCSSToken_Dashmatch = "dashmatch"; // |=
const eCSSToken_Beginsmatch = "beginsmatch"; // ^=
const eCSSToken_Endsmatch = "endsmatch"; // $=
const eCSSToken_Containsmatch = "containsmatch"; // *=
// Unicode-range token: currently used only in @font-face.
// The lexical rule for this token includes several forms that are
// semantically invalid. Therefore, mIdent always holds the
// complete original text of the token (so we can print it
// accurately in diagnostics), and mIntegerValid is true iff the
// token is semantically valid. In that case, mInteger holds the
// lowest value included in the range, and mInteger2 holds the
// highest value included in the range.
const eCSSToken_URange = "urange"; // U+007e U+01?? U+2000-206F
// HTML comment delimiters, ignored as a unit when they appear at
// the top level of a style sheet, for compatibility with websites
// written to work in pre-CSS browsers. This token type
// subsumes the css2.1 CDO and CDC tokens, which are always treated
// the same by the parser. mIdent holds the text of the token, for
// diagnostics.
const eCSSToken_HTMLComment = "htmlcomment"; // <!-- -->
// Bit flags stored in the scanner's mEOFCharacters field. They record which
// characters would have to be appended to the input to properly terminate a
// construct that was cut short by end of input (see performEOFFixup).
// Apart from DropBackslash (the lowest bit, which removes rather than
// appends), the bits are ordered like the entries of kImpliedEOFCharacters:
// AppendImpliedEOFCharacters shifts DropBackslash out and then walks both
// in parallel.
const eEOFCharacters_None = 0x0000;
// to handle \<EOF> inside strings
const eEOFCharacters_DropBackslash = 0x0001;
// to handle \<EOF> outside strings
const eEOFCharacters_ReplacementChar = 0x0002;
// to close comments
const eEOFCharacters_Asterisk = 0x0004;
const eEOFCharacters_Slash = 0x0008;
// to close double-quoted strings
const eEOFCharacters_DoubleQuote = 0x0010;
// to close single-quoted strings
const eEOFCharacters_SingleQuote = 0x0020;
// to close URLs
const eEOFCharacters_CloseParen = 0x0040;
// Bridge the char/string divide: the scanner works on numeric UTF-16 code
// units (as returned by String.prototype.charCodeAt), so every character it
// needs to compare against is precomputed here as a number.
const APOSTROPHE = "'".charCodeAt(0);
const ASTERISK = "*".charCodeAt(0);
const CARRIAGE_RETURN = "\r".charCodeAt(0);
const CIRCUMFLEX_ACCENT = "^".charCodeAt(0);
const COMMERCIAL_AT = "@".charCodeAt(0);
const DIGIT_NINE = "9".charCodeAt(0);
const DIGIT_ZERO = "0".charCodeAt(0);
const DOLLAR_SIGN = "$".charCodeAt(0);
const EQUALS_SIGN = "=".charCodeAt(0);
const EXCLAMATION_MARK = "!".charCodeAt(0);
const FULL_STOP = ".".charCodeAt(0);
const GREATER_THAN_SIGN = ">".charCodeAt(0);
const HYPHEN_MINUS = "-".charCodeAt(0);
const LATIN_CAPITAL_LETTER_E = "E".charCodeAt(0);
const LATIN_CAPITAL_LETTER_U = "U".charCodeAt(0);
const LATIN_SMALL_LETTER_E = "e".charCodeAt(0);
const LATIN_SMALL_LETTER_U = "u".charCodeAt(0);
const LEFT_PARENTHESIS = "(".charCodeAt(0);
const LESS_THAN_SIGN = "<".charCodeAt(0);
const LINE_FEED = "\n".charCodeAt(0);
const NUMBER_SIGN = "#".charCodeAt(0);
const PERCENT_SIGN = "%".charCodeAt(0);
const PLUS_SIGN = "+".charCodeAt(0);
const QUESTION_MARK = "?".charCodeAt(0);
const QUOTATION_MARK = '"'.charCodeAt(0);
const REVERSE_SOLIDUS = "\\".charCodeAt(0);
const RIGHT_PARENTHESIS = ")".charCodeAt(0);
const SOLIDUS = "/".charCodeAt(0);
const TILDE = "~".charCodeAt(0);
const VERTICAL_LINE = "|".charCodeAt(0);
// U+FFFD REPLACEMENT CHARACTER, substituted for NUL and invalid escapes.
const UCS2_REPLACEMENT_CHAR = 0xfffd;
// Characters implied by the eEOFCharacters_* bits, in bit order starting at
// eEOFCharacters_ReplacementChar (eEOFCharacters_DropBackslash has no entry
// because it appends nothing).
// NOTE(review): the trailing 0 looks like a terminator carried over from the
// C++ original; no flag visible in this file maps to it -- confirm.
const kImpliedEOFCharacters = [
UCS2_REPLACEMENT_CHAR,
ASTERISK,
SOLIDUS,
QUOTATION_MARK,
APOSTROPHE,
RIGHT_PARENTHESIS,
0,
];
// Largest number of arguments passed to a single Function.prototype.apply
// call by safeApply.
const ARGS_LENGTH_MAX = 500 * 1000;
/**
 * Apply |method| to a potentially huge argument list without exceeding the
 * 500000-argument limit of Firefox (see Bug 1414361).
 *
 * The argument array is cut into consecutive chunks of at most
 * ARGS_LENGTH_MAX items and |method| is applied once per chunk, always with
 * |scope| as "this".
 *
 * !! The return value is NOT the result of a single call: it is an array
 * holding the return value of each chunked call, in order. Combining them
 * into something meaningful is the caller's job. An empty |args| produces no
 * call at all and an empty result array !!
 *
 * @param {Function} method
 *        The method to apply.
 * @param {*} scope
 *        The scope ("this") to use when applying the method.
 * @param {Array} args
 *        The array of arguments to apply.
 *
 * @returns {Array}
 *          One return value per chunk that had to be created.
 */
function safeApply(method, scope, args) {
  const results = [];
  // The length is read once up front, before any call is made.
  const total = args.length;
  for (let start = 0; start < total; start += ARGS_LENGTH_MAX) {
    const chunk = args.slice(start, start + ARGS_LENGTH_MAX);
    results.push(method.apply(scope, chunk));
  }
  return results;
}
/**
 * Validate a code point produced by a hexadecimal escape.
 *
 * @param {Number} c the code point to check
 * @return {Number} |c| itself when it is usable, or U+FFFD when it lies
 *         beyond U+10FFFF or inside the surrogate range U+D800..U+DFFF.
 */
function ensureValidChar(c) {
  const isOutOfRange = c >= 0x00110000;
  // Masking off the low 11 bits leaves 0xd800 exactly for U+D800..U+DFFF.
  const isSurrogate = (c & 0xfff800) == 0xd800;
  return isOutOfRange || isSurrogate ? UCS2_REPLACEMENT_CHAR : c;
}
/**
 * Convert a string to the list of its UTF-16 code units.
 *
 * @param {String} str the input string
 * @return {Array} a new array with one numeric code unit per index of
 *         |str| (so a character outside the BMP contributes two entries).
 */
function stringToCodes(str) {
  // This is a hot path; a plain loop is faster than any other means
  // (e.g. Array#map).
  const codes = [];
  const total = str.length;
  let idx = 0;
  while (idx < total) {
    codes.push(str.charCodeAt(idx));
    idx++;
  }
  return codes;
}
// Character class bit flags. Each gLexTable entry is a bitwise OR of these,
// and IsOpenCharClass / IsClosedCharClass test a character against them.
const IS_HEX_DIGIT = 0x01;
const IS_IDSTART = 0x02;
const IS_IDCHAR = 0x04;
const IS_URL_CHAR = 0x08;
const IS_HSPACE = 0x10;
const IS_VSPACE = 0x20;
// Horizontal or vertical white space.
const IS_SPACE = IS_HSPACE | IS_VSPACE;
const IS_STRING = 0x40;
// Short aliases and the combinations that occur in gLexTable, so that the
// table itself stays compact.
const H = IS_HSPACE;
const V = IS_VSPACE;
const I = IS_IDCHAR;
const J = IS_IDSTART;
const U = IS_URL_CHAR;
const S = IS_STRING;
const X = IS_HEX_DIGIT;
const SH = S | H;
const SU = S | U;
const SUI = S | U | I;
const SUIJ = S | U | I | J;
const SUIX = S | U | I | X;
const SUIJX = S | U | I | J | X;
/* eslint-disable indent, indent-legacy, no-multi-spaces, comma-spacing, spaced-comment */
// Character class table for U+0000..U+007F, indexed by character code
// (128 entries). Each entry is a bitwise OR of the IS_* flags above. The
// comment in front of every group of eight entries names the characters the
// group covers. Points worth noticing: NUL has no class at all, the two
// quote characters are not in IS_STRING, and backslash is only an
// identifier start (J).
const gLexTable = [
// 00 01 02 03 04 05 06 07
0,
S,
S,
S,
S,
S,
S,
S,
// 08 TAB LF 0B FF CR 0E 0F
S,
SH,
V,
S,
V,
V,
S,
S,
// 10 11 12 13 14 15 16 17
S,
S,
S,
S,
S,
S,
S,
S,
// 18 19 1A 1B 1C 1D 1E 1F
S,
S,
S,
S,
S,
S,
S,
S,
//SPC ! " # $ % & '
SH,
SU,
0,
SU,
SU,
SU,
SU,
0,
// ( ) * + , - . /
S,
S,
SU,
SU,
SU,
SUI,
SU,
SU,
// 0 1 2 3 4 5 6 7
SUIX,
SUIX,
SUIX,
SUIX,
SUIX,
SUIX,
SUIX,
SUIX,
// 8 9 : ; < = > ?
SUIX,
SUIX,
SU,
SU,
SU,
SU,
SU,
SU,
// @ A B C D E F G
SU,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJ,
// H I J K L M N O
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
// P Q R S T U V W
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
// X Y Z [ \ ] ^ _
SUIJ,
SUIJ,
SUIJ,
SU,
J,
SU,
SU,
SUIJ,
// ` a b c d e f g
SU,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJX,
SUIJ,
// h i j k l m n o
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
// p q r s t u v w
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
SUIJ,
// x y z { | } ~ 7F
SUIJ,
SUIJ,
SUIJ,
SU,
SU,
SU,
SU,
S,
];
/* eslint-enable indent, indent-legacy, no-multi-spaces, comma-spacing, spaced-comment */
/**
 * Test whether 'ch' belongs to character class 'cls' (one of the IS_*
 * constants or a combination of them), treating the class as "open":
 * every character above U+007F is a member. EOF (a negative value) never is.
 */
function IsOpenCharClass(ch, cls) {
  if (!(ch >= 0)) {
    // EOF is reported as a negative value.
    return false;
  }
  if (ch >= 128) {
    return true;
  }
  return (gLexTable[ch] & cls) != 0;
}
/**
 * Test whether 'ch' belongs to character class 'cls' (one of the IS_*
 * constants or a combination of them), treating the class as "closed":
 * no character above U+007F is a member. EOF (a negative value) never is.
 */
function IsClosedCharClass(ch, cls) {
  const isInTable = ch >= 0 && ch < 128;
  return isInTable && (gLexTable[ch] & cls) != 0;
}
/**
 * Tell whether 'ch' is CSS whitespace: one of the ASCII characters
 * TAB, LF, FF, CR, or SPC (the IS_SPACE class).
 */
function IsWhitespace(ch) {
  const isSpace = IsClosedCharClass(ch, IS_SPACE);
  return isSpace;
}
/**
 * Tell whether 'ch' is horizontal whitespace (TAB or SPC), i.e. a member
 * of the IS_HSPACE class.
 */
function IsHorzSpace(ch) {
  const isHorizontal = IsClosedCharClass(ch, IS_HSPACE);
  return isHorizontal;
}
/**
 * Tell whether 'ch' is vertical whitespace (LF, FF, or CR), i.e. a member
 * of the IS_VSPACE class. Such characters must be consumed with
 * AdvanceLine rather than Advance.
 */
function IsVertSpace(ch) {
  const isVertical = IsClosedCharClass(ch, IS_VSPACE);
  return isVertical;
}
/**
 * Tell whether 'ch' may appear in the middle of an identifier.
 * U+0000 is accepted here because it is later turned into U+FFFD; it is
 * deliberately absent from the IS_IDCHAR class itself so that GatherText
 * does not treat it as an ordinary character.
 */
function IsIdentChar(ch) {
  if (ch == 0) {
    return true;
  }
  return IsOpenCharClass(ch, IS_IDCHAR);
}
/**
 * Tell whether 'ch' can, on its own, begin an identifier.
 * U+0000 is accepted here because it is later turned into U+FFFD; it is
 * deliberately absent from the IS_IDSTART class itself so that GatherText
 * does not treat it as an ordinary character.
 * (Every character accepted here is also accepted by IsIdentChar.)
 */
function IsIdentStart(ch) {
  if (ch == 0) {
    return true;
  }
  return IsOpenCharClass(ch, IS_IDSTART);
}
/**
 * Tell whether the two-character sequence aFirstChar+aSecondChar begins an
 * identifier: either the first character is an identifier start, or it is
 * '-' followed by another '-' or by an identifier start.
 */
function StartsIdent(aFirstChar, aSecondChar) {
  if (IsIdentStart(aFirstChar)) {
    return true;
  }
  if (aFirstChar != HYPHEN_MINUS) {
    return false;
  }
  return aSecondChar == HYPHEN_MINUS || IsIdentStart(aSecondChar);
}
/**
 * Tell whether 'ch' is one of the decimal digits '0'..'9'.
 */
function IsDigit(ch) {
  return DIGIT_ZERO <= ch && ch <= DIGIT_NINE;
}
/**
 * Tell whether 'ch' is a hexadecimal digit, i.e. a member of the
 * IS_HEX_DIGIT class.
 */
function IsHexDigit(ch) {
  const isHex = IsClosedCharClass(ch, IS_HEX_DIGIT);
  return isHex;
}
/**
 * Return the numeric value of the decimal digit 'ch'. No check is made:
 * the caller must already know that 'ch' is a decimal digit.
 */
function DecimalDigitValue(ch) {
  const distanceFromZero = ch - DIGIT_ZERO;
  return distanceFromZero;
}
/**
 * Return the numeric value of the hexadecimal digit 'ch'. No check is made:
 * the caller must already know that 'ch' is a hexadecimal digit.
 */
function HexDigitValue(ch) {
  if (!IsDigit(ch)) {
    // Keeping only the low three bits maps 'A'..'F' and 'a'..'f' alike to
    // 1..6, their position "relative to 10".
    return 9 + (ch & 0x7);
  }
  return DecimalDigitValue(ch);
}
/**
 * Map the first character of a two-character match operator (~= |= ^= $=
 * *=) to the token type of that operator. Any other character yields
 * eCSSToken_Symbol, meaning "not the start of a match operator".
 */
function MatchOperatorType(ch) {
  if (ch === TILDE) {
    return eCSSToken_Includes;
  }
  if (ch === VERTICAL_LINE) {
    return eCSSToken_Dashmatch;
  }
  if (ch === CIRCUMFLEX_ACCENT) {
    return eCSSToken_Beginsmatch;
  }
  if (ch === DOLLAR_SIGN) {
    return eCSSToken_Endsmatch;
  }
  if (ch === ASTERISK) {
    return eCSSToken_Containsmatch;
  }
  return eCSSToken_Symbol;
}
/**
 * Construct a scanner positioned at the start of |buffer|.
 *
 * @param {String} buffer the CSS text to tokenize; a falsy value is treated
 *        as the empty string.
 */
function Scanner(buffer) {
  const text = buffer || "";
  Object.assign(this, {
    mBuffer: text,
    // Read position and total length, in UTF-16 code units.
    mOffset: 0,
    mCount: text.length,
    // Line number of the read position and offset at which that line starts.
    mLineNumber: 1,
    mLineOffset: 0,
    // Same bookkeeping for the most recently returned token.
    mTokenLineOffset: 0,
    mTokenOffset: 0,
    mTokenLineNumber: 1,
    mEOFCharacters: eEOFCharacters_None,
  });
}
Scanner.prototype = {
/**
* The line number of the most recently returned token. Line
* numbers are 0-based.
*/
get lineNumber() {
return this.mTokenLineNumber - 1;
},
/**
* The column number of the most recently returned token. Column
* numbers are 0-based.
*/
get columnNumber() {
return this.mTokenOffset - this.mTokenLineOffset;
},
/**
* When EOF is reached, the last token might be unterminated in some
* ways. This method takes an input string and appends the needed
* terminators. In particular:
*
* 1. If EOF occurs mid-string, this will append the correct quote.
* 2. If EOF occurs in a url token, this will append the close paren.
* 3. If EOF occurs in a comment this will append the comment closer.
*
* A trailing backslash might also have been present in the input
* string. This is handled in different ways, depending on the
* context and arguments.
*
* If preserveBackslash is true, then the existing backslash at the
* end of inputString is preserved, and a new backslash is appended.
* That is, the input |\| is transformed to |\\|, and the
* input |'\| is transformed to |'\\'|.
*
* Otherwise, preserveBackslash is false:
* If the backslash appears in a string context, then the trailing
* backslash is dropped from inputString. That is, |"\| is
* transformed to |""|.
* If the backslash appears outside of a string context, then
* U+FFFD is appended. That is, |\| is transformed to a string
* with two characters: backslash followed by U+FFFD.
*
* Passing false for preserveBackslash makes the result conform to
* the CSS Syntax specification. However, passing true may give
* somewhat more intuitive behavior.
*
* @param aInputString the input string
* @param aPreserveBackslash how to handle trailing backslashes
* @return the input string with the termination characters appended
*/
performEOFFixup(aInputString, aPreserveBackslash) {
let result = aInputString;
let eofChars = this.mEOFCharacters;
if (
aPreserveBackslash &&
(eofChars &
(eEOFCharacters_DropBackslash | eEOFCharacters_ReplacementChar)) !=
0
) {
eofChars &= ~(
eEOFCharacters_DropBackslash | eEOFCharacters_ReplacementChar
);
result += "\\";
}
if (
(eofChars & eEOFCharacters_DropBackslash) != 0 &&
!!result.length &&
result.endsWith("\\")
) {
result = result.slice(0, -1);
}
const extra = [];
this.AppendImpliedEOFCharacters(eofChars, extra);
const asString = String.fromCharCode.apply(null, extra);
return result + asString;
},
/**
* Return the next token, or null at EOF.
*
* The token object is described by the following WebIDL definition:
*
* dictionary CSSToken {
* // The token type.
* CSSTokenType tokenType = "whitespace";
*
* // Offset of the first character of the token.
* unsigned long startOffset = 0;
* // Offset of the character after the final character of the token.
* // This is chosen so that the offsets can be passed to |substring|
* // to yield the exact contents of the token.
* unsigned long endOffset = 0;
*
* // If the token is a number, percentage, or dimension, this holds
* // the value. This is not present for other token types.
* double number;
* // If the token is a number, percentage, or dimension, this is true
* // iff the number had an explicit sign. This is not present for
* // other token types.
* boolean hasSign;
* // If the token is a number, percentage, or dimension, this is true
* // iff the number was specified as an integer. This is not present
* // for other token types.
* boolean isInteger;
*
* // Text associated with the token. This is not present for all
* // token types. In particular it is:
* //
* // Token type Meaning
* // ===============================
* // ident The identifier.
* // function The function name. Note that the "(" is part
* // of the token but is not present in |text|.
* // at The word.
* // id The word.
* // hash The word.
* // dimension The dimension.
* // string The string contents after escape processing.
* // bad_string Ditto.
* // url The URL after escape processing.
* // bad_url Ditto.
* // symbol The symbol text.
* DOMString text;
* };
*/
nextToken() {
const token = {};
if (!this.Next(token)) {
return null;
}
const resultToken = {};
resultToken.tokenType = token.mType;
resultToken.startOffset = this.mTokenOffset;
resultToken.endOffset = this.mOffset;
const constructText = () => {
return safeApply(String.fromCharCode, null, token.mIdent).join("");
};
switch (token.mType) {
case eCSSToken_Whitespace:
break;
case eCSSToken_Ident:
case eCSSToken_Function:
case eCSSToken_AtKeyword:
case eCSSToken_ID:
case eCSSToken_Hash:
resultToken.text = constructText();
break;
case eCSSToken_Dimension:
resultToken.text = constructText();
/* Fall through. */
case eCSSToken_Number:
case eCSSToken_Percentage:
resultToken.number = token.mNumber;
resultToken.hasSign = token.mHasSign;
resultToken.isInteger = token.mIntegerValid;
break;
case eCSSToken_String:
case eCSSToken_Bad_String:
case eCSSToken_URL:
case eCSSToken_Bad_URL:
resultToken.text = constructText();
/* Don't bother emitting the delimiter, as it is readily extracted
from the source string when needed. */
break;
case eCSSToken_Symbol:
resultToken.text = String.fromCharCode(token.mSymbol);
break;
case eCSSToken_Includes:
case eCSSToken_Dashmatch:
case eCSSToken_Beginsmatch:
case eCSSToken_Endsmatch:
case eCSSToken_Containsmatch:
case eCSSToken_URange:
break;
case eCSSToken_Comment:
case eCSSToken_HTMLComment:
/* The comment text is easily extracted from the source string,
and is rarely useful. */
break;
}
return resultToken;
},
/**
* Return the raw UTF-16 code unit at position |this.mOffset + n| within
* the read buffer. If that is beyond the end of the buffer, returns
* -1 to indicate end of input.
*/
Peek(n = 0) {
if (this.mOffset + n >= this.mCount) {
return -1;
}
return this.mBuffer.charCodeAt(this.mOffset + n);
},
/**
* Advance |this.mOffset| over |n| code units. Advance(0) is a no-op.
* If |n| is greater than the distance to end of input, will silently
* stop at the end. May not be used to advance over a line boundary;
* AdvanceLine() must be used instead.
*/
Advance(n = 1) {
if (this.mOffset + n >= this.mCount || this.mOffset + n < this.mOffset) {
this.mOffset = this.mCount;
} else {
this.mOffset += n;
}
},
/**
* Advance |this.mOffset| over a line boundary.
*/
AdvanceLine() {
// Advance over \r\n as a unit.
if (
this.mBuffer.charCodeAt(this.mOffset) == CARRIAGE_RETURN &&
this.mOffset + 1 < this.mCount &&
this.mBuffer.charCodeAt(this.mOffset + 1) == LINE_FEED
) {
this.mOffset += 2;
} else {
this.mOffset += 1;
}
// 0 is a magical line number meaning that we don't know (i.e., script)
if (this.mLineNumber != 0) {
this.mLineNumber++;
}
this.mLineOffset = this.mOffset;
},
/**
* Skip over a sequence of whitespace characters (vertical or
* horizontal) starting at the current read position.
*/
SkipWhitespace() {
for (;;) {
const ch = this.Peek();
if (!IsWhitespace(ch)) {
// EOF counts as non-whitespace
break;
}
if (IsVertSpace(ch)) {
this.AdvanceLine();
} else {
this.Advance();
}
}
},
/**
* Skip over one CSS comment starting at the current read position.
*/
SkipComment() {
this.Advance(2);
for (;;) {
let ch = this.Peek();
if (ch < 0) {
this.SetEOFCharacters(eEOFCharacters_Asterisk | eEOFCharacters_Slash);
return;
}
if (ch == ASTERISK) {
this.Advance();
ch = this.Peek();
if (ch < 0) {
this.SetEOFCharacters(eEOFCharacters_Slash);
return;
}
if (ch == SOLIDUS) {
this.Advance();
return;
}
} else if (IsVertSpace(ch)) {
this.AdvanceLine();
} else {
this.Advance();
}
}
},
/**
* If there is a valid escape sequence starting at the current read
* position, consume it, decode it, append the result to |aOutput|,
* and return true. Otherwise, consume nothing, leave |aOutput|
* unmodified, and return false. If |aInString| is true, accept the
* additional form of escape sequence allowed within string-like tokens.
*/
GatherEscape(aOutput, aInString) {
// NOTE(review): assumes the character at the read position is the
// backslash itself (GatherText only calls this after seeing one), so the
// escaped character is the one at offset 1.
let ch = this.Peek(1);
if (ch < 0) {
// Backslash directly followed by end of input.
// If we are in a string (or a url() containing a string), we want to drop
// the backslash on the floor. Otherwise, we want to treat it as a U+FFFD
// character.
this.Advance();
if (aInString) {
this.SetEOFCharacters(eEOFCharacters_DropBackslash);
} else {
aOutput.push(UCS2_REPLACEMENT_CHAR);
this.SetEOFCharacters(eEOFCharacters_ReplacementChar);
}
return true;
}
if (IsVertSpace(ch)) {
if (aInString) {
// In strings (and in url() containing a string), escaped
// newlines are completely removed, to allow splitting over
// multiple lines.
// Advance() skips the backslash, AdvanceLine() the line break.
this.Advance();
this.AdvanceLine();
return true;
}
// Outside of strings, backslash followed by a newline is not an escape.
return false;
}
if (!IsHexDigit(ch)) {
// "Any character (except a hexadecimal digit, linefeed, carriage
// return, or form feed) can be escaped with a backslash to remove
// its special meaning." -- CSS2.1 section 4.1.3
// Consume the backslash and the escaped character together.
this.Advance(2);
if (ch == 0) {
// An escaped NUL is replaced, like NUL anywhere else.
aOutput.push(UCS2_REPLACEMENT_CHAR);
} else {
aOutput.push(ch);
}
return true;
}
// "[at most six hexadecimal digits following a backslash] stand
// for the ISO 10646 character with that number, which must not be
// zero. (It is undefined in CSS 2.1 what happens if a style sheet
// does contain a character with Unicode codepoint zero.)"
// -- CSS2.1 section 4.1.3
// At this point we know we have \ followed by at least one
// hexadecimal digit, therefore the escape sequence is valid and we
// can go ahead and consume the backslash.
this.Advance();
// Accumulate up to six hex digits; |ch| already holds the first one.
let val = 0;
let i = 0;
do {
val = val * 16 + HexDigitValue(ch);
i++;
this.Advance();
ch = this.Peek();
} while (i < 6 && IsHexDigit(ch));
// "Interpret the hex digits as a hexadecimal number. If this
// number is zero, or is greater than the maximum allowed
// codepoint, return U+FFFD REPLACEMENT CHARACTER" -- CSS Syntax
// Level 3
if (val == 0) {
aOutput.push(UCS2_REPLACEMENT_CHAR);
} else {
// ensureValidChar also replaces surrogates. The pushed value is a full
// code point and may exceed 0xFFFF.
aOutput.push(ensureValidChar(val));
}
// Consume exactly one whitespace character after a
// hexadecimal escape sequence.
if (IsVertSpace(ch)) {
this.AdvanceLine();
} else if (IsHorzSpace(ch)) {
this.Advance();
}
return true;
},
/**
* Consume a run of "text" beginning with the current read position,
* consisting of characters in the class |aClass| (which must be a
* suitable argument to IsOpenCharClass) plus escape sequences.
* Append the text to |aText|, after decoding escape sequences.
*
* Returns true if at least one character was appended to |aText|,
* false otherwise.
*/
GatherText(aClass, aText) {
const start = this.mOffset;
const inString = aClass == IS_STRING;
for (;;) {
// Consume runs of unescaped characters in one go.
let n = this.mOffset;
while (
n < this.mCount &&
IsOpenCharClass(this.mBuffer.charCodeAt(n), aClass)
) {
n++;
}
if (n > this.mOffset) {
const codes = stringToCodes(this.mBuffer.slice(this.mOffset, n));
safeApply(Array.prototype.push, aText, codes);
this.mOffset = n;
}
if (n == this.mCount) {
break;
}
const ch = this.Peek();
if (ch == 0) {
this.Advance();
aText.push(UCS2_REPLACEMENT_CHAR);
continue;
}
if (ch != REVERSE_SOLIDUS) {
break;
}
if (!this.GatherEscape(aText, inString)) {
break;
}
}
return this.mOffset > start;
},
/**
* Scan an Ident token. This also handles Function and URL tokens,
* both of which begin indistinguishably from an identifier. It can
* produce a Symbol token when an apparent identifier actually led
* into an invalid escape sequence.
*/
ScanIdent(aToken) {
if (!this.GatherText(IS_IDCHAR, aToken.mIdent)) {
aToken.mSymbol = this.Peek();
this.Advance();
return true;
}
if (this.Peek() != LEFT_PARENTHESIS) {
aToken.mType = eCSSToken_Ident;
return true;
}
this.Advance();
aToken.mType = eCSSToken_Function;
const asString = String.fromCharCode.apply(null, aToken.mIdent);
if (asString.toLowerCase() === "url") {
this.NextURL(aToken);
}
return true;
},
/**
* Scan an AtKeyword token. Also handles production of Symbol when
* an '@' is not followed by an identifier.
*/
ScanAtKeyword(aToken) {
// Fall back for when '@' isn't followed by an identifier.
aToken.mSymbol = COMMERCIAL_AT;
this.Advance();
const ch = this.Peek();
if (StartsIdent(ch, this.Peek(1))) {
if (this.GatherText(IS_IDCHAR, aToken.mIdent)) {
aToken.mType = eCSSToken_AtKeyword;
}
}
return true;
},
/**
* Scan a Hash token. Handles the distinction between eCSSToken_ID
* and eCSSToken_Hash, and handles production of Symbol when a '#'
* is not followed by identifier characters.
*/
ScanHash(aToken) {
// Fall back for when '#' isn't followed by identifier characters.
aToken.mSymbol = NUMBER_SIGN;
this.Advance();
const ch = this.Peek();
if (IsIdentChar(ch) || ch == REVERSE_SOLIDUS) {
const type = StartsIdent(ch, this.Peek(1))
? eCSSToken_ID
: eCSSToken_Hash;
aToken.mIdent.length = 0;
if (this.GatherText(IS_IDCHAR, aToken.mIdent)) {
aToken.mType = type;
}
}
return true;
},
/**
* Scan a Number, Percentage, or Dimension token (all of which begin
* like a Number). Can produce a Symbol when a '.' is not followed by
* digits, or when '+' or '-' are not followed by either a digit or a
* '.' and then a digit. Can also produce a HTMLComment when it
* encounters '-->'.
*/
ScanNumber(aToken) {
// NOTE(review): every digit loop below consumes its first character
// unconditionally, so this presumably relies on the caller having checked
// that the input really starts a number (optional sign, then a digit or a
// '.' followed by a digit) -- confirm against Next().
let c = this.Peek();
// Sign of the mantissa (-1 or 1).
const sign = c == HYPHEN_MINUS ? -1 : 1;
// Absolute value of the integer part of the mantissa. This is a double so
// we don't run into overflow issues for consumers that only care about our
// floating-point value while still being able to express the full int32_t
// range for consumers who want integers.
let intPart = 0;
// Fractional part of the mantissa. This is a double so that when
// we convert to float at the end we'll end up rounding to nearest
// float instead of truncating down (as we would if fracPart were
// a float and we just effectively lost the last several digits).
let fracPart = 0;
// Absolute value of the power of 10 that we should multiply by
// (only relevant for numbers in scientific notation). Has to be
// a signed integer, because multiplication of signed by unsigned
// converts the unsigned to signed, so if we plan to actually
// multiply by expSign...
// (The typing remarks above are inherited from the C++ original; in JS
// all of these are ordinary numbers.)
let exponent = 0;
// Sign of the exponent.
let expSign = 1;
aToken.mHasSign = c == PLUS_SIGN || c == HYPHEN_MINUS;
if (aToken.mHasSign) {
this.Advance();
c = this.Peek();
}
// A leading '.' means there is no integer part at all.
let gotDot = c == FULL_STOP;
if (!gotDot) {
// Scan the integer part of the mantissa.
do {
intPart = 10 * intPart + DecimalDigitValue(c);
this.Advance();
c = this.Peek();
} while (IsDigit(c));
// A '.' only belongs to the number when a digit follows it.
gotDot = c == FULL_STOP && IsDigit(this.Peek(1));
}
if (gotDot) {
// Scan the fractional part of the mantissa.
this.Advance();
c = this.Peek();
// Power of ten by which we need to divide our next digit
let divisor = 10;
do {
fracPart += DecimalDigitValue(c) / divisor;
divisor *= 10;
this.Advance();
c = this.Peek();
} while (IsDigit(c));
}
let gotE = false;
if (c == LATIN_SMALL_LETTER_E || c == LATIN_CAPITAL_LETTER_E) {
// Look ahead before consuming anything: 'e' only introduces an exponent
// when followed by a digit, or by a sign and then a digit. Otherwise it
// is left in place and may become the start of a dimension unit below.
const expSignChar = this.Peek(1);
const nextChar = this.Peek(2);
if (
IsDigit(expSignChar) ||
((expSignChar == HYPHEN_MINUS || expSignChar == PLUS_SIGN) &&
IsDigit(nextChar))
) {
gotE = true;
if (expSignChar == HYPHEN_MINUS) {
expSign = -1;
}
this.Advance(); // consumes the E
if (expSignChar == HYPHEN_MINUS || expSignChar == PLUS_SIGN) {
// Consume the sign; the first exponent digit is the character after it.
this.Advance();
c = nextChar;
} else {
c = expSignChar;
}
do {
exponent = 10 * exponent + DecimalDigitValue(c);
this.Advance();
c = this.Peek();
} while (IsDigit(c));
}
}
let type = eCSSToken_Number;
// Set mIntegerValid for all cases (except %, below) because we need
// it for the "2n" in :nth-child(2n).
aToken.mIntegerValid = false;
// Time to reassemble our number.
// Do all the math in double precision so it's truncated only once.
let value = sign * (intPart + fracPart);
if (gotE) {
// Scale by ten to the signed exponent. (The C++ original needed an
// explicit cast to double here; Math.pow does not.)
value *= Math.pow(10.0, expSign * exponent);
} else if (!gotDot) {
// The number had the lexical form of an integer. Clamp mInteger to the
// safe-integer range; mNumber keeps the unclamped value.
if (sign > 0) {
aToken.mInteger = Math.min(intPart, Number.MAX_SAFE_INTEGER);
} else {
aToken.mInteger = Math.max(-intPart, Number.MIN_SAFE_INTEGER);
}
aToken.mIntegerValid = true;
}
const ident = aToken.mIdent;
// Check for Dimension and Percentage tokens.
if (c >= 0) {
if (StartsIdent(c, this.Peek(1))) {
// The unit text is gathered into mIdent.
if (this.GatherText(IS_IDCHAR, ident)) {
type = eCSSToken_Dimension;
}
} else if (c == PERCENT_SIGN) {
this.Advance();
type = eCSSToken_Percentage;
// Percentages are stored as a fraction (100% => 1.0) and are never
// considered integers.
value = value / 100.0;
aToken.mIntegerValid = false;
}
}
aToken.mNumber = value;
aToken.mType = type;
return true;
},
/**
* Scan a string constant ('foo' or "foo"). Will always produce
* either a String or a Bad_String token; the latter occurs when the
* close quote is missing. Always returns true (for convenience in Next()).
*/
ScanString(aToken) {
const aStop = this.Peek();
aToken.mType = eCSSToken_String;
aToken.mSymbol = aStop; // Remember how it's quoted.
this.Advance();
for (;;) {
this.GatherText(IS_STRING, aToken.mIdent);
const ch = this.Peek();
if (ch == -1) {
this.AddEOFCharacters(
aStop == QUOTATION_MARK
? eEOFCharacters_DoubleQuote
: eEOFCharacters_SingleQuote
);
break; // EOF ends a string token with no error.
}
if (ch == aStop) {
this.Advance();
break;
}
// Both " and ' are excluded from IS_STRING.
if (ch == QUOTATION_MARK || ch == APOSTROPHE) {
aToken.mIdent.push(ch);
this.Advance();
continue;
}
aToken.mType = eCSSToken_Bad_String;
break;
}
return true;
},
/**
* Scan a unicode-range token. These match the regular expression
*
* u\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?
*
* However, some such tokens are "invalid". There are three valid forms:
*
* u+[0-9a-f]{x} 1 <= x <= 6
* u+[0-9a-f]{x}\?{y} 1 <= x+y <= 6
* u+[0-9a-f]{x}-[0-9a-f]{y} 1 <= x <= 6, 1 <= y <= 6
*
* All unicode-range tokens have their text recorded in mIdent; valid ones
* are also decoded into mInteger and mInteger2, and mIntegerValid is set.
* Note that this does not validate the numeric range, only the syntactic
* form.
*/
ScanURange(aResult) {
// NOTE(review): the first loop consumes its first character
// unconditionally, so this presumably relies on the caller having checked
// that the input is "u+" (or "U+") followed by a hex digit or '?' --
// confirm against Next().
const intro1 = this.Peek();
const intro2 = this.Peek(1);
let ch = this.Peek(2);
// Record the two introducer characters as part of the token text.
aResult.mIdent.push(intro1);
aResult.mIdent.push(intro2);
this.Advance(2);
let valid = true;
let haveQues = false;
// |low| and |high| are built in parallel: a hex digit contributes itself to
// both, a '?' wildcard contributes 0 to the low bound and f to the high.
let low = 0;
let high = 0;
let i = 0;
do {
aResult.mIdent.push(ch);
if (IsHexDigit(ch)) {
if (haveQues) {
valid = false; // All question marks should be at the end.
}
low = low * 16 + HexDigitValue(ch);
high = high * 16 + HexDigitValue(ch);
} else {
haveQues = true;
low = low * 16 + 0x0;
high = high * 16 + 0xf;
}
i++;
this.Advance();
ch = this.Peek();
} while (i < 6 && (IsHexDigit(ch) || ch == QUESTION_MARK));
// Optional explicit upper bound: '-' followed by one to six hex digits.
if (ch == HYPHEN_MINUS && IsHexDigit(this.Peek(1))) {
if (haveQues) {
// Wildcards cannot be combined with an explicit range.
valid = false;
}
aResult.mIdent.push(ch);
this.Advance();
ch = this.Peek();
// The explicit bound replaces the one derived from the first part.
high = 0;
i = 0;
do {
aResult.mIdent.push(ch);
high = high * 16 + HexDigitValue(ch);
i++;
this.Advance();
ch = this.Peek();
} while (i < 6 && IsHexDigit(ch));
}
// The bounds are stored even for invalid tokens; mIntegerValid tells
// whether they are meaningful.
aResult.mInteger = low;
aResult.mInteger2 = high;
aResult.mIntegerValid = valid;
aResult.mType = eCSSToken_URange;
return true;
},
// Overwrite the recorded EOF-character bits with |aEOFCharacters|,
// discarding whatever was stored in mEOFCharacters before.
SetEOFCharacters(aEOFCharacters) {
this.mEOFCharacters = aEOFCharacters;
},
AddEOFCharacters(aEOFCharacters) {
this.mEOFCharacters = this.mEOFCharacters | aEOFCharacters;
},
AppendImpliedEOFCharacters(aEOFCharacters, aResult) {
// First, ignore eEOFCharacters_DropBackslash.
let c = aEOFCharacters >> 1;
// All of the remaining EOFCharacters bits represent appended characters,
// and the bits are in the order that they need appending.
for (const p of kImpliedEOFCharacters) {
if (c & 1) {
aResult.push(p);
}
c >>= 1;
}
},
/**
* Consume the part of an URL token after the initial 'url('. Caller
* is assumed to have consumed 'url(' already. Will always produce
* either an URL or a Bad_URL token.
*
* Exposed for use by nsCSSParser::ParseMozDocumentRule, which applies
* the special lexical rules for URL tokens in a nonstandard context.
*/
NextURL(aToken) {
this.SkipWhitespace();
// aToken.mIdent may be "url" at this point; clear that out
aToken.mIdent.length = 0;
let hasString = false;
let ch = this.Peek();
// Do we have a string?
if (ch == QUOTATION_MARK || ch == APOSTROPHE) {
this.ScanString(aToken);
if (aToken.mType == eCSSToken_Bad_String) {
aToken.mType = eCSSToken_Bad_URL;
return;
}
hasString = true;
} else {
// Otherwise, this is the start of a non-quoted url (which may be empty).
aToken.mSymbol = 0;
this.GatherText(IS_URL_CHAR, aToken.mIdent);
}
// Consume trailing whitespace and then look for a close parenthesis.
this.SkipWhitespace();
ch = this.Peek();
// ch can be less than zero indicating EOF
if (ch < 0 || ch == RIGHT_PARENTHESIS) {
this.Advance();
aToken.mType = eCSSToken_URL;
if (ch < 0) {
this.AddEOFCharacters(eEOFCharacters_CloseParen);
}
} else {
aToken.mType = eCSSToken_Bad_URL;
if (!hasString) {
// Consume until before the next right parenthesis, which follows
// how <bad-url-token> is consumed in CSS Syntax 3 spec.
// Note that, we only do this when "url(" is not followed by a
// string, because in the spec, "url(" followed by a string is
// handled as a url function rather than a <url-token>, so the
// rest of content before ")" should be consumed in balance,
// which will be done by the parser.
// The closing ")" is not consumed here. It is left to the parser
// so that the parser can handle both cases.
do {
if (IsVertSpace(ch)) {
this.AdvanceLine();
} else {
this.Advance();
}
ch = this.Peek();
} while (ch >= 0 && ch != RIGHT_PARENTHESIS);
}
}
},
/**
* Primary scanner entry point. Consume one token and fill in
* |aToken| accordingly. Whitespace and comments are not skipped:
* each is consumed and returned as its own token (eCSSToken_Whitespace
* or eCSSToken_Comment).
*
* Returns true if it successfully consumed a token, false if EOF has
* been reached. Will always advance the current read position by at
* least one character unless called when already at EOF.
*/
Next(aToken) {
// do this here so we don't have to do it in dozens of other places
aToken.mIdent = [];
aToken.mType = eCSSToken_Symbol;
this.mTokenOffset = this.mOffset;
this.mTokenLineOffset = this.mLineOffset;
this.mTokenLineNumber = this.mLineNumber;
const ch = this.Peek();
if (IsWhitespace(ch)) {
this.SkipWhitespace();
aToken.mType = eCSSToken_Whitespace;
return true;
}
if (
ch == SOLIDUS && // !IsSVGMode() &&
this.Peek(1) == ASTERISK
) {
this.SkipComment();
aToken.mType = eCSSToken_Comment;
return true;
}
// EOF
if (ch < 0) {
return false;
}
// 'u' could be UNICODE-RANGE or an identifier-family token
if (ch == LATIN_SMALL_LETTER_U || ch == LATIN_CAPITAL_LETTER_U) {
const c2 = this.Peek(1);
const c3 = this.Peek(2);
if (c2 == PLUS_SIGN && (IsHexDigit(c3) || c3 == QUESTION_MARK)) {
return this.ScanURange(aToken);
}
return this.ScanIdent(aToken);
}
// identifier family
if (IsIdentStart(ch)) {
return this.ScanIdent(aToken);
}
// number family
if (IsDigit(ch)) {
return this.ScanNumber(aToken);
}
if (ch == FULL_STOP && IsDigit(this.Peek(1))) {
return this.ScanNumber(aToken);
}
if (ch == PLUS_SIGN) {
const c2 = this.Peek(1);
if (IsDigit(c2) || (c2 == FULL_STOP && IsDigit(this.Peek(2)))) {
return this.ScanNumber(aToken);
}
}
// HYPHEN_MINUS can start an identifier-family token, a number-family token,
// or an HTML-comment
if (ch == HYPHEN_MINUS) {
const c2 = this.Peek(1);
const c3 = this.Peek(2);
if (IsIdentStart(c2) || (c2 == HYPHEN_MINUS && c3 != GREATER_THAN_SIGN)) {
return this.ScanIdent(aToken);
}
if (IsDigit(c2) || (c2 == FULL_STOP && IsDigit(c3))) {
return this.ScanNumber(aToken);
}
if (c2 == HYPHEN_MINUS && c3 == GREATER_THAN_SIGN) {
this.Advance(3);
aToken.mType = eCSSToken_HTMLComment;
aToken.mIdent = stringToCodes("-->");
return true;
}
}
// the other HTML-comment token
if (
ch == LESS_THAN_SIGN &&
this.Peek(1) == EXCLAMATION_MARK &&
this.Peek(2) == HYPHEN_MINUS &&
this.Peek(3) == HYPHEN_MINUS
) {
this.Advance(4);
aToken.mType = eCSSToken_HTMLComment;
aToken.mIdent = stringToCodes("<!--");
return true;
}
// AT_KEYWORD
if (ch == COMMERCIAL_AT) {
return this.ScanAtKeyword(aToken);
}
// HASH
if (ch == NUMBER_SIGN) {
return this.ScanHash(aToken);
}
// STRING
if (ch == QUOTATION_MARK || ch == APOSTROPHE) {
return this.ScanString(aToken);
}
// Match operators: ~= |= ^= $= *=
const opType = MatchOperatorType(ch);
if (opType != eCSSToken_Symbol && this.Peek(1) == EQUALS_SIGN) {
aToken.mType = opType;
this.Advance(2);
return true;
}
// Otherwise, a symbol (DELIM).
aToken.mSymbol = ch;
this.Advance();
return true;
},
};
/**
* Create and return a new CSS lexer.
*
* @param {String} input the CSS text to lex
* @param {Boolean} useInspectorCSSParser Set to true to use InspectorCSSParser.
* @param {Boolean} trackEOFChars Set to true if performEOFFixup will be called.
* @return {CSSLexer} the new lexer
*/
function getCSSLexer(
  input,
  useInspectorCSSParser = false,
  trackEOFChars = false
) {
  // trackEOFChars is only forwarded to the InspectorCSSParser-backed lexer;
  // the Scanner constructor is called with the input alone.
  return useInspectorCSSParser
    ? new InspectorCSSParserWrapper(input, { trackEOFChars })
    : new Scanner(input);
}
exports.getCSSLexer = getCSSLexer;
/**
* Wrapper around InspectorCSSParser.
* Once/if https://github.com/servo/rust-cssparser/pull/374 lands, we can remove this class.
*/
class InspectorCSSParserWrapper {
  // End offset of the last token returned by nextToken().
  #offset = 0;
  // Whether nextToken() should record what performEOFFixup() needs.
  #trackEOFChars;
  // Bit set of eEOFCharacters_* flags gathered while lexing.
  #eofCharacters = eEOFCharacters_None;

  /**
   *
   * @param {String} input: The CSS text to lex
   * @param {Object} options
   * @param {Boolean} options.trackEOFChars: Set to true if performEOFFixup will be called.
   */
  constructor(input, options = {}) {
    this.parser = new InspectorCSSParser(input);
    this.#trackEOFChars = options.trackEOFChars;
  }

  get lineNumber() {
    return this.parser.lineNumber;
  }

  get columnNumber() {
    return this.parser.columnNumber;
  }

  /**
   * Tell whether the run of backslashes that ends right before index `end`
   * has an odd length. Backslashes pair up as escaped backslashes, so an odd
   * run means the last one is not itself escaped: it either escapes the
   * character at `end`, or dangles if `end` is the end of the text.
   *
   * @param {String} text
   * @param {Number} end: Index just past the backslash run to inspect.
   * @return {Boolean}
   */
  static #hasOddBackslashRun(text, end) {
    let count = 0;
    for (let i = end - 1; i >= 0 && text[i] === "\\"; i--) {
      count++;
    }
    return count % 2 === 1;
  }

  /**
   * Update #eofCharacters according to how `token` ends, so that
   * performEOFFixup knows which terminators are missing.
   *
   * @param {Object} token: A token returned by InspectorCSSParser.
   */
  #recordEOFCharacters(token) {
    const { tokenType, text } = token;
    const lastChar = text[text.length - 1];
    const hasDanglingBackslash =
      InspectorCSSParserWrapper.#hasOddBackslashRun(text, text.length);

    if (tokenType === "Comment") {
      // The text includes the "/*" opener, so a closed comment is at least
      // 4 characters long: the opener's "*" can't double as the closer's
      // ("/*/" is still an open comment).
      if (text.length >= 4 && text.endsWith("*/")) {
        return;
      }
      // Likewise, a trailing "*" only counts as the first half of the closer
      // when it isn't the opener's own "*".
      this.#eofCharacters =
        text.length >= 3 && lastChar === "*"
          ? eEOFCharacters_Slash
          : eEOFCharacters_Asterisk | eEOFCharacters_Slash;
      return;
    }

    if (tokenType === "QuotedString" || tokenType === "BadString") {
      // Only an unescaped trailing backslash dangles; "\\" at the end is a
      // complete escape and must be left alone.
      if (hasDanglingBackslash) {
        this.#eofCharacters |= eEOFCharacters_DropBackslash;
      }
      // The string is closed only if it ends with its own quote character,
      // that character isn't the opening quote itself, and it isn't escaped.
      const isClosed =
        text.length >= 2 &&
        lastChar === text[0] &&
        !InspectorCSSParserWrapper.#hasOddBackslashRun(text, text.length - 1);
      if (!isClosed) {
        this.#eofCharacters |=
          text[0] === `"`
            ? eEOFCharacters_DoubleQuote
            : eEOFCharacters_SingleQuote;
      }
      return;
    }

    if (hasDanglingBackslash) {
      this.#eofCharacters = eEOFCharacters_ReplacementChar;
    }

    // For some reason, we only automatically close `url`, other functions
    // will have their opening parenthesis escaped.
    if (
      (tokenType === "Function" && token.value === "url") ||
      tokenType === "BadUrl" ||
      (tokenType === "UnquotedUrl" && lastChar !== ")")
    ) {
      this.#eofCharacters |= eEOFCharacters_CloseParen;
    }

    if (tokenType === "CloseParenthesis") {
      this.#eofCharacters &= ~eEOFCharacters_CloseParen;
    }
  }

  /**
   * Return the next token from the underlying parser, decorated with
   * `startOffset` and `endOffset`, or the parser's falsy value when there
   * are no more tokens.
   *
   * @return {Object|null}
   */
  nextToken() {
    const token = this.parser.nextToken();
    if (!token) {
      return token;
    }

    if (this.#trackEOFChars) {
      this.#recordEOFCharacters(token);
    }

    // At the moment, InspectorCSSParser doesn't expose offsets, so we need to compute
    // them manually here.
    // We can do that because we are retrieving every token in the input string, and so the
    // end offset of the last token is the start offset of the new token.
    token.startOffset = this.#offset;
    this.#offset += token.text.length;
    token.endOffset = this.#offset;

    return token;
  }

  /**
   * When EOF is reached, the last token might be unterminated in some
   * ways. This method takes an input string and appends the needed
   * terminators. In particular:
   *
   * 1. If EOF occurs mid-string, this will append the correct quote.
   * 2. If EOF occurs in a url token, this will append the close paren.
   * 3. If EOF occurs in a comment this will append the comment closer.
   *
   * A trailing backslash might also have been present in the input
   * string. This is handled in different ways, depending on the
   * context and arguments.
   *
   * If preserveBackslash is true, then the existing backslash at the
   * end of inputString is preserved, and a new backslash is appended.
   * That is, the input |\| is transformed to |\\|, and the
   * input |'\| is transformed to |'\\'|.
   *
   * Otherwise, preserveBackslash is false:
   * If the backslash appears in a string context, then the trailing
   * backslash is dropped from inputString. That is, |"\| is
   * transformed to |""|.
   * If the backslash appears outside of a string context, then
   * U+FFFD is appended. That is, |\| is transformed to a string
   * with two characters: backslash followed by U+FFFD.
   *
   * Passing false for preserveBackslash makes the result conform to
   * the CSS Syntax specification. However, passing true may give
   * somewhat more intuitive behavior.
   *
   * @param inputString the input string
   * @param preserveBackslash how to handle trailing backslashes
   * @return the input string with the termination characters appended
   */
  performEOFFixup(inputString, preserveBackslash) {
    const backslashFlags =
      eEOFCharacters_DropBackslash | eEOFCharacters_ReplacementChar;
    let result = inputString;
    let eofChars = this.#eofCharacters;

    if (preserveBackslash && (eofChars & backslashFlags) != 0) {
      // Keep the dangling backslash and escape it with a second one.
      eofChars &= ~backslashFlags;
      result += "\\";
    }

    if (
      (eofChars & eEOFCharacters_DropBackslash) != 0 &&
      result.endsWith("\\")
    ) {
      result = result.slice(0, -1);
    }

    // First, ignore eEOFCharacters_DropBackslash.
    let c = eofChars >> 1;

    // All of the remaining EOFCharacters bits represent appended characters,
    // and the bits are in the order that they need appending.
    for (const p of kImpliedEOFCharacters) {
      if (c & 1) {
        result += String.fromCharCode(p);
      }
      c >>= 1;
    }

    return result;
  }
}