/*
 * Copyright (C) 2020  Yomichan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/**
 * A class used to scan text in a document.
 */
class DOMTextScanner {
    /**
     * Creates a new instance of a DOMTextScanner.
     * If the node is directly inside an <rt> element whose parent is a <ruby> element,
     * scanning starts at the <ruby> element instead and the offset is reset when the
     * first text node is reached.
     * @param node The DOM Node to start at.
     * @param offset The character offset to start at when node is a text node.
     *   Use 0 for non-text nodes.
     * @param forcePreserveWhitespace When true, every text node is treated as if both
     *   newlines and whitespace are preserved, without consulting the computed style.
     * @param generateLayoutContent When true, newline counts derived from element layout
     *   (display/position) are recorded and emitted into the content; when false they are ignored.
     */
    constructor(node, offset, forcePreserveWhitespace=false, generateLayoutContent=true) {
        const ruby = DOMTextScanner.getParentRubyElement(node);
        const resetOffset = (ruby !== null);
        if (resetOffset) { node = ruby; }

        this._node = node;
        this._offset = offset;
        this._content = '';
        this._remainder = 0;
        this._resetOffset = resetOffset;
        this._newlines = 0;
        this._lineHasWhitespace = false;
        this._lineHasContent = false;
        this._forcePreserveWhitespace = forcePreserveWhitespace;
        this._generateLayoutContent = generateLayoutContent;
    }

    /**
     * Gets the current node being scanned.
     * @returns A DOM Node, or null if a previous seek() ran past the end of the node tree.
     */
    get node() {
        return this._node;
    }

    /**
     * Gets the current offset corresponding to the node being scanned.
     * This value is only applicable for text nodes.
     * @returns An integer.
     */
    get offset() {
        return this._offset;
    }

    /**
     * Gets the accumulated content string resulting from calls to seek().
     * @returns A string.
     */
    get content() {
        return this._content;
    }

    /**
     * Seeks a given length in the document and accumulates the text content.
     * @param length A positive or negative integer corresponding to how many characters
     *   should be added to content. Content is only added to the accumulation string,
     *   never removed, so mixing seek calls with differently signed length values
     *   may give unexpected results.
     * @returns this
     */
    seek(length) {
        const forward = (length >= 0);
        this._remainder = (forward ? length : -length);
        if (length === 0) { return this; }

        const TEXT_NODE = Node.TEXT_NODE;
        const ELEMENT_NODE = Node.ELEMENT_NODE;

        const generateLayoutContent = this._generateLayoutContent;
        let node = this._node;
        let resetOffset = this._resetOffset;
        let newlines = 0;
        while (node !== null) {
            let enterable = false;
            const nodeType = node.nodeType;

            if (nodeType === TEXT_NODE) {
                if (!(
                    forward ?
                    this._seekTextNodeForward(node, resetOffset) :
                    this._seekTextNodeBackward(node, resetOffset)
                )) {
                    // Length reached
                    break;
                }
            } else if (nodeType === ELEMENT_NODE) {
                // Entering an element may introduce a pending layout newline
                [enterable, newlines] = DOMTextScanner.getElementSeekInfo(node);
                if (newlines > this._newlines && generateLayoutContent) {
                    this._newlines = newlines;
                }
            }

            const exitedNodes = [];
            node = DOMTextScanner.getNextNode(node, forward, enterable, exitedNodes);

            // Exiting an element may also introduce a pending layout newline
            for (const exitedNode of exitedNodes) {
                if (exitedNode.nodeType !== ELEMENT_NODE) { continue; }
                newlines = DOMTextScanner.getElementSeekInfo(exitedNode)[1];
                if (newlines > this._newlines && generateLayoutContent) {
                    this._newlines = newlines;
                }
            }

            // Any node visited after the first one is scanned from its start (or end, when going backward)
            resetOffset = true;
        }

        this._node = node;
        this._resetOffset = resetOffset;

        return this;
    }

    // Private

    /**
     * Seeks forward in a text node.
     * @param textNode The text node to use.
     * @param resetOffset Whether or not the text offset should be reset.
     * @returns true if scanning should continue, or false if the scan length has been reached.
     */
    _seekTextNodeForward(textNode, resetOffset) {
        const nodeValue = textNode.nodeValue;
        const nodeValueLength = nodeValue.length;
        const [preserveNewlines, preserveWhitespace] = (
            this._forcePreserveWhitespace ?
            [true, true] :
            DOMTextScanner.getWhitespaceSettings(textNode)
        );

        let lineHasWhitespace = this._lineHasWhitespace;
        let lineHasContent = this._lineHasContent;
        let content = this._content;
        let offset = resetOffset ? 0 : this._offset;
        let remainder = this._remainder;
        let newlines = this._newlines;

        while (offset < nodeValueLength) {
            const char = nodeValue[offset];
            const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
            ++offset;

            if (charAttributes === 0) {
                // Character should be ignored
                continue;
            } else if (charAttributes === 1) {
                // Character is collapsable whitespace
                lineHasWhitespace = true;
            } else {
                // Character should be added to the content
                if (newlines > 0) {
                    // Pending layout newlines are only emitted once there is preceding content
                    if (content.length > 0) {
                        const useNewlineCount = Math.min(remainder, newlines);
                        content += '\n'.repeat(useNewlineCount);
                        remainder -= useNewlineCount;
                        newlines -= useNewlineCount;
                    } else {
                        newlines = 0;
                    }
                    lineHasContent = false;
                    lineHasWhitespace = false;
                    if (remainder <= 0) {
                        --offset; // Revert character offset
                        break;
                    }
                }

                lineHasContent = (charAttributes === 2); // 3 = character is a newline

                if (lineHasWhitespace) {
                    if (lineHasContent) {
                        // Collapsed whitespace is emitted as a single space before the character
                        content += ' ';
                        lineHasWhitespace = false;
                        if (--remainder <= 0) {
                            --offset; // Revert character offset
                            break;
                        }
                    } else {
                        lineHasWhitespace = false;
                    }
                }

                content += char;

                if (--remainder <= 0) { break; }
            }
        }

        this._lineHasWhitespace = lineHasWhitespace;
        this._lineHasContent = lineHasContent;
        this._content = content;
        this._offset = offset;
        this._remainder = remainder;
        this._newlines = newlines;

        return (remainder > 0);
    }

    /**
     * Seeks backward in a text node.
     * This function is nearly the same as _seekTextNodeForward, with the following differences:
     * - Iteration condition is reversed to check if offset is greater than 0.
     * - offset is reset to nodeValueLength instead of 0.
     * - offset is decremented instead of incremented.
     * - offset is decremented before getting the character.
     * - offset is reverted by incrementing instead of decrementing.
     * - content string is prepended instead of appended.
     * @param textNode The text node to use.
     * @param resetOffset Whether or not the text offset should be reset.
     * @returns true if scanning should continue, or false if the scan length has been reached.
     */
    _seekTextNodeBackward(textNode, resetOffset) {
        const nodeValue = textNode.nodeValue;
        const nodeValueLength = nodeValue.length;
        const [preserveNewlines, preserveWhitespace] = (
            this._forcePreserveWhitespace ?
            [true, true] :
            DOMTextScanner.getWhitespaceSettings(textNode)
        );

        let lineHasWhitespace = this._lineHasWhitespace;
        let lineHasContent = this._lineHasContent;
        let content = this._content;
        let offset = resetOffset ? nodeValueLength : this._offset;
        let remainder = this._remainder;
        let newlines = this._newlines;

        while (offset > 0) {
            --offset;
            const char = nodeValue[offset];
            const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);

            if (charAttributes === 0) {
                // Character should be ignored
                continue;
            } else if (charAttributes === 1) {
                // Character is collapsable whitespace
                lineHasWhitespace = true;
            } else {
                // Character should be added to the content
                if (newlines > 0) {
                    // Pending layout newlines are only emitted once there is following content
                    if (content.length > 0) {
                        const useNewlineCount = Math.min(remainder, newlines);
                        content = '\n'.repeat(useNewlineCount) + content;
                        remainder -= useNewlineCount;
                        newlines -= useNewlineCount;
                    } else {
                        newlines = 0;
                    }
                    lineHasContent = false;
                    lineHasWhitespace = false;
                    if (remainder <= 0) {
                        ++offset; // Revert character offset
                        break;
                    }
                }

                lineHasContent = (charAttributes === 2); // 3 = character is a newline

                if (lineHasWhitespace) {
                    if (lineHasContent) {
                        // Collapsed whitespace is emitted as a single space after the character
                        content = ' ' + content;
                        lineHasWhitespace = false;
                        if (--remainder <= 0) {
                            ++offset; // Revert character offset
                            break;
                        }
                    } else {
                        lineHasWhitespace = false;
                    }
                }

                content = char + content;

                if (--remainder <= 0) { break; }
            }
        }

        this._lineHasWhitespace = lineHasWhitespace;
        this._lineHasContent = lineHasContent;
        this._content = content;
        this._offset = offset;
        this._remainder = remainder;
        this._newlines = newlines;

        return (remainder > 0);
    }

    // Static helpers

    /**
     * Gets the next node in the document for a specified scanning direction.
     * @param node The current DOM Node.
     * @param forward Whether to scan forward in the document or backward.
     * @param visitChildren Whether the children of the current node should be visited.
     * @param exitedNodes An array which stores nodes which were exited.
     * @returns The next node in the document, or null if there is no next node.
     */
    static getNextNode(node, forward, visitChildren, exitedNodes) {
        let next = visitChildren ? (forward ? node.firstChild : node.lastChild) : null;
        if (next === null) {
            // No child to enter: walk to the next sibling, climbing ancestors as needed
            while (true) {
                exitedNodes.push(node);

                next = (forward ? node.nextSibling : node.previousSibling);
                if (next !== null) { break; }

                next = node.parentNode;
                if (next === null) { break; }

                node = next;
            }
        }
        return next;
    }

    /**
     * Gets the nearest element for a given Node: the node itself if it is an element,
     * otherwise its closest ancestor which is an element.
     * @param node The node to check.
     * @returns The element if one exists, otherwise null.
     */
    static getParentElement(node) {
        while (node !== null && node.nodeType !== Node.ELEMENT_NODE) {
            node = node.parentNode;
        }
        return node;
    }

    /**
     * Gets the parent <ruby> element of a given node, if one exists. For efficiency purposes,
     * this only checks the immediate parent elements and does not check all ancestors, so
     * there are cases where the node may be in a ruby element but it is not returned.
     * @param node The node to check.
     * @returns The <ruby> element if the nearest element of the input node is an <rt>
     *   whose parent is a <ruby>, otherwise null.
     */
    static getParentRubyElement(node) {
        node = DOMTextScanner.getParentElement(node);
        if (node !== null && node.nodeName.toUpperCase() === 'RT') {
            node = node.parentNode;
            if (node !== null && node.nodeName.toUpperCase() === 'RUBY') {
                return node;
            }
        }
        return null;
    }

    /**
     * Gets seek information about an element.
     * @param element The element to check.
     * @returns [enterable: boolean, newlines: integer]
     *   The enterable value indicates whether the content of this node should be entered.
     *   The newlines value corresponds to the number of newline characters that should be added.
     *     1 newline corresponds to a simple new line in the layout.
     *     2 newlines corresponds to a significant visual distinction since the previous content.
     */
    static getElementSeekInfo(element) {
        let enterable = true;
        switch (element.nodeName.toUpperCase()) {
            case 'HEAD':
            case 'RT':
            case 'SCRIPT':
            case 'STYLE':
                return [false, 0];
            case 'BR':
                return [false, 1];
            case 'TEXTAREA':
            case 'INPUT':
            case 'BUTTON':
                // Content is not entered, but the element can still affect layout
                enterable = false;
                break;
        }

        const style = window.getComputedStyle(element);
        const display = style.display;

        const visible = (display !== 'none' && DOMTextScanner.isStyleVisible(style));
        let newlines = 0;

        if (!visible) {
            enterable = false;
        } else {
            switch (style.position) {
                case 'absolute':
                case 'fixed':
                case 'sticky':
                    newlines = 2;
                    break;
            }
            if (newlines === 0 && DOMTextScanner.doesCSSDisplayChangeLayout(display)) {
                newlines = 1;
            }
        }

        return [enterable, newlines];
    }

    /**
     * Gets information about how whitespace characters are treated.
     * @param textNode The Text node to check.
     * @returns [preserveNewlines: boolean, preserveWhitespace: boolean]
     *   The value of preserveNewlines indicates whether or not newline characters are treated as line breaks.
     *   The value of preserveWhitespace indicates whether or not sequences of whitespace characters are
     *   preserved (true) or collapsed (false).
     */
    static getWhitespaceSettings(textNode) {
        const element = DOMTextScanner.getParentElement(textNode);
        if (element !== null) {
            const style = window.getComputedStyle(element);
            switch (style.whiteSpace) {
                case 'pre':
                case 'pre-wrap':
                case 'break-spaces':
                    return [true, true];
                case 'pre-line':
                    return [true, false];
            }
        }
        return [false, false];
    }

    /**
     * Gets attributes for the specified character.
     * @param character A string containing a single character.
     * @param preserveNewlines Whether line feed characters are treated as line breaks.
     * @param preserveWhitespace Whether tab, form feed, carriage return and space are kept as content.
     * @returns An integer representing the attributes of the character.
     *   0: Character should be ignored.
     *   1: Character is collapsable whitespace.
     *   2: Character should be added to the content.
     *   3: Character should be added to the content and is a newline.
     */
    static getCharacterAttributes(character, preserveNewlines, preserveWhitespace) {
        switch (character.charCodeAt(0)) {
            case 0x09: // Tab ('\t')
            case 0x0c: // Form feed ('\f')
            case 0x0d: // Carriage return ('\r')
            case 0x20: // Space (' ')
                return preserveWhitespace ? 2 : 1;
            case 0x0a: // Line feed ('\n')
                return preserveNewlines ? 3 : 1;
            case 0x200c: // Zero-width non-joiner ('\u200c')
                return 0;
            default: // Other
                return 2;
        }
    }

    /**
     * Checks whether a given style is visible or not.
     * This function does not check style.display === 'none'.
     * @param style An object implementing the CSSStyleDeclaration interface.
     * @returns true if the style should result in an element being visible, otherwise false.
     */
    static isStyleVisible(style) {
        return !(
            style.visibility === 'hidden' ||
            parseFloat(style.opacity) <= 0 ||
            parseFloat(style.fontSize) <= 0 ||
            (
                // Transparent text only counts as hidden when it also cannot be selected
                !DOMTextScanner.isStyleSelectable(style) &&
                (
                    DOMTextScanner.isCSSColorTransparent(style.color) ||
                    DOMTextScanner.isCSSColorTransparent(style.webkitTextFillColor)
                )
            )
        );
    }

    /**
     * Checks whether a given style is selectable or not.
     * @param style An object implementing the CSSStyleDeclaration interface.
     * @returns true if the style is selectable, otherwise false.
     */
    static isStyleSelectable(style) {
        return !(
            style.userSelect === 'none' ||
            style.webkitUserSelect === 'none' ||
            style.MozUserSelect === 'none' ||
            style.msUserSelect === 'none'
        );
    }

    /**
     * Checks whether a CSS color is transparent or not.
     * @param cssColor A CSS color string, expected to be encoded in rgb(a) form.
     * @returns true if the color is an rgba() value whose alpha component is zero, otherwise false.
     */
    static isCSSColorTransparent(cssColor) {
        return (
            typeof cssColor === 'string' &&
            cssColor.startsWith('rgba(') &&
            // The decimal point must be escaped; otherwise any character after the leading 0 is accepted
            /,\s*0\.?0*\)$/.test(cssColor)
        );
    }

    /**
     * Checks whether a CSS display value will cause a layout change for text.
     * @param cssDisplay A CSS string corresponding to the value of the display property.
     * @returns true if the layout is changed by this value, otherwise false.
     */
    static doesCSSDisplayChangeLayout(cssDisplay) {
        let pos = cssDisplay.indexOf(' ');
        if (pos >= 0) {
            // Truncate to first part of a multi-keyword value
            cssDisplay = cssDisplay.substring(0, pos);
        }

        pos = cssDisplay.indexOf('-');
        if (pos >= 0) {
            // Truncate to first part of kebab-case value
            cssDisplay = cssDisplay.substring(0, pos);
        }

        switch (cssDisplay) {
            case 'block':
            case 'flex':
            case 'grid':
            case 'list': // list-item
            case 'table': // table, table-*
                return true;
            case 'ruby': // ruby-*
                // Plain 'ruby' does not change layout; only the ruby-* values do
                return (pos >= 0);
            default:
                return false;
        }
    }
}

Yomichan DOMTextScanner Tests

+ + + Layout newlines expected due to entering and exiting display:block nodes. +
小ぢん
まり1
+
小ぢん
まり2
+
+ + + Layout newline expected due to sequential display:block elements. +
小ぢんまり1
小ぢんまり2
+
+ + + Layout newline expected due to sequential display:block elements separated by a newline. +
小ぢんまり1
+
小ぢんまり2
+
+ + + No newlines expected due to display:inline. +小ぢんまり1小ぢんまり2 + + + + No newlines expected due to white-space:normal. +小ぢんまり1 +小ぢんまり2 + + + + Newline expected due to white-space:pre. +
+小ぢんまり1
+小ぢんまり2
+
+
+ + + No newlines expected due to display:inline-block. Actual layout flow cannot be determined by DOM/CSS alone. +小ぢんまり1小ぢんまり2 + + + + Single newline expected due to display:block layout. +
小ぢんまり1
小ぢんまり2
+
+ + + Two newlines expected due to position:absolute causing a significant layout change. +
小ぢんまり1
小ぢんまり2
+
+ + + Two newlines expected due to position:fixed causing a significant layout change. +
小ぢんまり1
小ぢんまり2
+
+ + + Two newlines expected due to position:sticky being able to cause a significant layout change. +
小ぢんまり1
小ぢんまり2
+
+ + + Scanning text starting in an <rt> element. Should start scanning at the start of the <ruby> tag instead. +
()ぢんまり1
+
+ + + Skip <script> content. +
小ぢんまり1
+
+ + + Skip <style> content. +
小ぢんまり1
+
+ + + Skip <textarea> content. +
小ぢんまり1
+
+ + + Skip <input> content. +
小ぢんまり1
+
+ + + Skip <button> content. +
小ぢんまり1
+
+ + + Skip content with font-size:0. +
小ぢんcontentまり1
+
+ + + Skip content with opacity:0. +
小ぢんcontentまり1
+
+ + + Skip content with visibility:hidden. +
小ぢんcontentまり1
+
+ + + Skip content with display:none. +
小ぢんcontentまり1
+
+ + + Don't skip content with user-select:none. +
小ぢまり1
+
+ + + Skip content with user-select:none and a transparent color. +
小ぢんcontentまり1
+
+ + + \ No newline at end of file diff --git a/test/data/html/test-stylesheet.css b/test/data/html/test-stylesheet.css index f63d2481..2e9a2f52 100644 --- a/test/data/html/test-stylesheet.css +++ b/test/data/html/test-stylesheet.css @@ -28,7 +28,9 @@ a, a:visited { text-decoration: underline; } -.test { +.test, +y-test { + display: block; background-color: #ffffff; margin: 1em 0; padding: 0.5em; @@ -36,7 +38,8 @@ a, a:visited { border-radius: 4px; } -.test:before { +.test:before, +y-test:before { content: "Test " counter(test-id); display: block; counter-increment: test-id; @@ -45,7 +48,10 @@ a, a:visited { font-weight: bold; } -.description { +.description, +y-description { color: #444444; font-style: italic; + display: block; + padding-bottom: 0.5em; } diff --git a/test/test-dom-text-scanner.js b/test/test-dom-text-scanner.js new file mode 100644 index 00000000..41d6e307 --- /dev/null +++ b/test/test-dom-text-scanner.js @@ -0,0 +1,181 @@ +/* + * Copyright (C) 2020 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
/*
 * Test driver for DOMTextScanner.
 * Loads the HTML fixture with jsdom, executes fg/js/dom-text-scanner.js inside the
 * project VM, and checks every <y-test> element against its data-test-data expectations.
 */

const fs = require('fs');
const path = require('path');
const assert = require('assert');
const {JSDOM} = require('jsdom');
const {VM} = require('./yomichan-vm');


/**
 * Creates a JSDOM instance from an HTML file on disk.
 * @param fileName Path of the HTML file to load.
 * @returns A new JSDOM instance.
 */
function createJSDOM(fileName) {
    const domSource = fs.readFileSync(fileName, {encoding: 'utf8'});
    return new JSDOM(domSource);
}

/**
 * Resolves a selector which may end in the custom suffix `::text` (first text node child)
 * or `::nth-text(N)` (N-th text node child, 1-based) of the matched element.
 * @param element The element to query within.
 * @param selector A CSS selector, optionally followed by one of the custom suffixes.
 * @returns The matched element when no suffix is used, the requested text node when a
 *   suffix is used, or null if nothing matches.
 */
function querySelectorTextNode(element, selector) {
    let textIndex = -1;
    const match = /::text$|::nth-text\((\d+)\)$/.exec(selector);
    if (match !== null) {
        textIndex = (match[1] ? parseInt(match[1], 10) - 1 : 0);
        selector = selector.substring(0, selector.length - match[0].length);
    }
    const result = element.querySelector(selector);
    // Avoid dereferencing a null result when the selector matches nothing
    if (textIndex < 0 || result === null) {
        return result;
    }
    for (let n = result.firstChild; n !== null; n = n.nextSibling) {
        if (n.nodeType === n.constructor.TEXT_NODE) {
            if (textIndex === 0) {
                return n;
            }
            --textIndex;
        }
    }
    return null;
}


/**
 * Finds the font size, in pixels, of the nearest element (starting at the given node and
 * walking up through parentNode) whose computed fontSize is expressed in px.
 * @param window The window object providing the Node constants.
 * @param getComputedStyle The computed style function to use.
 * @param element The node to start at.
 * @returns The font size in pixels, or a default of 14 if no px value is found.
 */
function getComputedFontSizeInPixels(window, getComputedStyle, element) {
    for (; element !== null; element = element.parentNode) {
        if (element.nodeType === window.Node.ELEMENT_NODE) {
            const fontSize = getComputedStyle(element).fontSize;
            if (fontSize.endsWith('px')) {
                const value = parseFloat(fontSize.substring(0, fontSize.length - 2));
                return value;
            }
        }
    }
    const defaultFontSize = 14;
    return defaultFontSize;
}

/**
 * Creates a getComputedStyle replacement whose string property values have em units
 * converted to px units, based on the font size found by getComputedFontSizeInPixels.
 * @param window The window whose getComputedStyle should be wrapped.
 * @returns A function with the same call signature as window.getComputedStyle.
 */
function createAbsoluteGetComputedStyle(window) {
    // Wrapper to convert em units to px units
    const getComputedStyleOld = window.getComputedStyle.bind(window);
    // Accepts multi-digit integers and fractions (e.g. 10em, 1.25em, .5em) with an optional exponent
    const emPattern = /([-+]?(?:\d+(?:\.\d*)?|\.\d+)(?:[eE][-+]?\d+)?)em/g;
    return (element, ...args) => {
        const style = getComputedStyleOld(element, ...args);
        return new Proxy(style, {
            get: (target, property) => {
                let result = target[property];
                if (typeof result === 'string') {
                    result = result.replace(emPattern, (g0, g1) => {
                        const fontSize = getComputedFontSizeInPixels(window, getComputedStyleOld, element);
                        return `${parseFloat(g1) * fontSize}px`;
                    });
                }
                return result;
            }
        });
    };
}


/**
 * Runs every test case stored in the data-test-data attribute of the <y-test> elements.
 * Each case is checked as a full forward scan, as progressively shorter forward scans,
 * and (unless reversible is false) as the equivalent backward scans.
 * @param dom The JSDOM instance containing the test document.
 * @param DOMTextScanner The DOMTextScanner class under test.
 */
async function testDomTextScanner(dom, {DOMTextScanner}) {
    const document = dom.window.document;
    for (const testElement of document.querySelectorAll('y-test')) {
        let testData = JSON.parse(testElement.dataset.testData);
        if (!Array.isArray(testData)) {
            testData = [testData];
        }
        for (const testDataItem of testData) {
            let {
                node,
                offset,
                length,
                forcePreserveWhitespace,
                generateLayoutContent,
                reversible,
                expected: {
                    node: expectedNode,
                    offset: expectedOffset,
                    content: expectedContent
                }
            } = testDataItem;

            const nodeSelector = node;
            const expectedNodeSelector = expectedNode;
            node = querySelectorTextNode(testElement, nodeSelector);
            expectedNode = querySelectorTextNode(testElement, expectedNodeSelector);

            // Fail with a clear message when the fixture and its selectors are out of sync
            assert.ok(node !== null, `Start node not found for selector: ${nodeSelector}`);
            assert.ok(expectedNode !== null, `Expected node not found for selector: ${expectedNodeSelector}`);

            // Standard test
            {
                const scanner = new DOMTextScanner(node, offset, forcePreserveWhitespace, generateLayoutContent);
                scanner.seek(length);

                const {node: actualNode1, offset: actualOffset1, content: actualContent1} = scanner;
                assert.strictEqual(actualContent1, expectedContent);
                assert.strictEqual(actualOffset1, expectedOffset);
                assert.strictEqual(actualNode1, expectedNode);
            }

            // Substring tests
            for (let i = 1; i <= length; ++i) {
                const scanner = new DOMTextScanner(node, offset, forcePreserveWhitespace, generateLayoutContent);
                scanner.seek(length - i);

                const {content: actualContent} = scanner;
                assert.strictEqual(actualContent, expectedContent.substring(0, expectedContent.length - i));
            }

            if (reversible === false) { continue; }

            // Reversed test
            {
                const scanner = new DOMTextScanner(expectedNode, expectedOffset, forcePreserveWhitespace, generateLayoutContent);
                scanner.seek(-length);

                const {content: actualContent} = scanner;
                assert.strictEqual(actualContent, expectedContent);
            }

            // Reversed substring tests
            for (let i = 1; i <= length; ++i) {
                const scanner = new DOMTextScanner(expectedNode, expectedOffset, forcePreserveWhitespace, generateLayoutContent);
                scanner.seek(-(length - i));

                const {content: actualContent} = scanner;
                assert.strictEqual(actualContent, expectedContent.substring(i));
            }
        }
    }
}


/**
 * Loads the DOMTextScanner fixture document, runs the scanner source in the VM with the
 * jsdom globals, and executes the tests. The jsdom window is always closed afterwards.
 */
async function testDocument1() {
    const dom = createJSDOM(path.join(__dirname, 'data', 'html', 'test-dom-text-scanner.html'));
    const window = dom.window;
    try {
        const {document, Node, Range} = window;

        window.getComputedStyle = createAbsoluteGetComputedStyle(window);

        const vm = new VM({document, window, Range, Node});
        vm.execute('fg/js/dom-text-scanner.js');
        const DOMTextScanner = vm.get('DOMTextScanner');

        await testDomTextScanner(dom, {DOMTextScanner});
    } finally {
        window.close();
    }
}


/**
 * Entry point: runs all test documents.
 */
async function main() {
    await testDocument1();
}


if (require.main === module) {
    // Do not leave the promise floating: report failures and signal them via the exit code
    main().catch((error) => {
        console.error(error);
        process.exitCode = 1;
    });
}