Update DOMTextScanner to support UTF-16 surrogate pairs (#2213)
This commit is contained in:
parent
5c267f4bb7
commit
c5c5308ff2
@ -145,6 +145,44 @@ class DOMTextScanner {
|
||||
|
||||
// Private
|
||||
|
||||
/**
|
||||
* Reads a code point in a string in the forward direction.
|
||||
* @param {string} text The text to read the code point from.
|
||||
* @param {number} position The index of the first character to read.
|
||||
* @returns {string} The code point starting at `position`, as a string of one or two UTF-16 code units.
|
||||
*/
|
||||
_readCodePointForward(text, position) {
|
||||
let char = text[position];
|
||||
const charCode = char.charCodeAt(0);
|
||||
if (charCode >= 0xd800 && charCode < 0xdc00 && ++position < text.length) {
|
||||
const char2 = text[position];
|
||||
const charCode2 = char2.charCodeAt(0);
|
||||
if (charCode2 >= 0xdc00 && charCode2 < 0xe000) {
|
||||
char += char2;
|
||||
}
|
||||
}
|
||||
return char;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a code point in a string in the backward direction.
|
||||
* @param {string} text The text to read the code point from.
|
||||
* @param {number} position The index of the last UTF-16 code unit of the code point to read.
|
||||
* @returns {string} The code point ending at `position`, as a string of one or two UTF-16 code units.
|
||||
*/
|
||||
_readCodePointBackward(text, position) {
|
||||
let char = text[position];
|
||||
const charCode = char.charCodeAt(0);
|
||||
if (charCode >= 0xdc00 && charCode < 0xe000 && position > 0) {
|
||||
const char2 = text[position - 1];
|
||||
const charCode2 = char2.charCodeAt(0);
|
||||
if (charCode2 >= 0xd800 && charCode2 < 0xdc00) {
|
||||
char = char2 + char;
|
||||
}
|
||||
}
|
||||
return char;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks forward in a text node.
|
||||
* @param {Text} textNode The text node to use.
|
||||
@ -164,9 +202,9 @@ class DOMTextScanner {
|
||||
let newlines = this._newlines;
|
||||
|
||||
while (offset < nodeValueLength) {
|
||||
const char = nodeValue[offset];
|
||||
const char = this._readCodePointForward(nodeValue, offset);
|
||||
offset += char.length;
|
||||
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
|
||||
++offset;
|
||||
|
||||
if (charAttributes === 0) {
|
||||
// Character should be ignored
|
||||
@ -188,7 +226,7 @@ class DOMTextScanner {
|
||||
lineHasContent = false;
|
||||
lineHasWhitespace = false;
|
||||
if (remainder <= 0) {
|
||||
--offset; // Revert character offset
|
||||
offset -= char.length; // Revert character offset
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -200,7 +238,7 @@ class DOMTextScanner {
|
||||
content += ' ';
|
||||
lineHasWhitespace = false;
|
||||
if (--remainder <= 0) {
|
||||
--offset; // Revert character offset
|
||||
offset -= char.length; // Revert character offset
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
@ -250,8 +288,8 @@ class DOMTextScanner {
|
||||
let newlines = this._newlines;
|
||||
|
||||
while (offset > 0) {
|
||||
--offset;
|
||||
const char = nodeValue[offset];
|
||||
const char = this._readCodePointBackward(nodeValue, offset - 1);
|
||||
offset -= char.length;
|
||||
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
|
||||
|
||||
if (charAttributes === 0) {
|
||||
@ -274,7 +312,7 @@ class DOMTextScanner {
|
||||
lineHasContent = false;
|
||||
lineHasWhitespace = false;
|
||||
if (remainder <= 0) {
|
||||
++offset; // Revert character offset
|
||||
offset += char.length; // Revert character offset
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -286,7 +324,7 @@ class DOMTextScanner {
|
||||
content = ' ' + content;
|
||||
lineHasWhitespace = false;
|
||||
if (--remainder <= 0) {
|
||||
++offset; // Revert character offset
|
||||
offset += char.length; // Revert character offset
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
Loading…
Reference in New Issue
Block a user