Update DOMTextScanner to support UTF-16 surrogate pairs (#2213)

This commit is contained in:
toasted-nutbread 2022-08-20 11:32:20 -04:00 committed by GitHub
parent 5c267f4bb7
commit c5c5308ff2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -145,6 +145,44 @@ class DOMTextScanner {
// Private
/**
* Reads a code point in a string in the forward direction.
* @param {string} text The text to read the code point from.
* @param {number} position The index of the first character to read.
* @returns {string} The code point from the string.
*/
_readCodePointForward(text, position) {
let char = text[position];
const charCode = char.charCodeAt(0);
if (charCode >= 0xd800 && charCode < 0xdc00 && ++position < text.length) {
const char2 = text[position];
const charCode2 = char2.charCodeAt(0);
if (charCode2 >= 0xdc00 && charCode2 < 0xe000) {
char += char2;
}
}
return char;
}
/**
* Reads a code point in a string in the backward direction.
* @param {string} text The text to read the code point from.
* @param {number} position The index of the first character to read.
* @returns {string} The code point from the string.
*/
_readCodePointBackward(text, position) {
let char = text[position];
const charCode = char.charCodeAt(0);
if (charCode >= 0xdc00 && charCode < 0xe000 && position > 0) {
const char2 = text[position - 1];
const charCode2 = char2.charCodeAt(0);
if (charCode2 >= 0xd800 && charCode2 < 0xdc00) {
char = char2 + char;
}
}
return char;
}
/**
* Seeks forward in a text node.
* @param {Text} textNode The text node to use.
@ -164,9 +202,9 @@ class DOMTextScanner {
let newlines = this._newlines;
while (offset < nodeValueLength) {
const char = nodeValue[offset];
const char = this._readCodePointForward(nodeValue, offset);
offset += char.length;
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
++offset;
if (charAttributes === 0) {
// Character should be ignored
@ -188,7 +226,7 @@ class DOMTextScanner {
lineHasContent = false;
lineHasWhitespace = false;
if (remainder <= 0) {
--offset; // Revert character offset
offset -= char.length; // Revert character offset
break;
}
}
@ -200,7 +238,7 @@ class DOMTextScanner {
content += ' ';
lineHasWhitespace = false;
if (--remainder <= 0) {
--offset; // Revert character offset
offset -= char.length; // Revert character offset
break;
}
} else {
@ -250,8 +288,8 @@ class DOMTextScanner {
let newlines = this._newlines;
while (offset > 0) {
--offset;
const char = nodeValue[offset];
const char = this._readCodePointBackward(nodeValue, offset - 1);
offset -= char.length;
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
if (charAttributes === 0) {
@ -274,7 +312,7 @@ class DOMTextScanner {
lineHasContent = false;
lineHasWhitespace = false;
if (remainder <= 0) {
++offset; // Revert character offset
offset += char.length; // Revert character offset
break;
}
}
@ -286,7 +324,7 @@ class DOMTextScanner {
content = ' ' + content;
lineHasWhitespace = false;
if (--remainder <= 0) {
++offset; // Revert character offset
offset += char.length; // Revert character offset
break;
}
} else {