Refactor sentence parsing (#1215)

* Rename sentenceExtent to sentenceScanExtent

* Update TextScanner.setOptions

* Change function argument order

* Rename quote map variables

* Fix edge case quote handling

* Update terminator maps to support character inclusion
This commit is contained in:
toasted-nutbread 2021-01-09 23:10:55 -05:00 committed by GitHub
parent da1e1e5c5b
commit 083da93142
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 89 additions and 63 deletions

View File

@ -312,7 +312,7 @@ class Frontend {
async _updateOptionsInternal() { async _updateOptionsInternal() {
const optionsContext = await this._getOptionsContext(); const optionsContext = await this._getOptionsContext();
const options = await api.optionsGet(optionsContext); const options = await api.optionsGet(optionsContext);
const scanningOptions = options.scanning; const {scanning: scanningOptions, sentenceParsing: sentenceParsingOptions} = options;
this._options = options; this._options = options;
await this._updatePopup(); await this._updatePopup();
@ -326,7 +326,7 @@ class Frontend {
touchInputEnabled: scanningOptions.touchInputEnabled, touchInputEnabled: scanningOptions.touchInputEnabled,
pointerEventsEnabled: scanningOptions.pointerEventsEnabled, pointerEventsEnabled: scanningOptions.pointerEventsEnabled,
scanLength: scanningOptions.length, scanLength: scanningOptions.length,
sentenceExtent: options.sentenceParsing.scanExtent, sentenceScanExtent: sentenceParsingOptions.scanExtent,
layoutAwareScan: scanningOptions.layoutAwareScan, layoutAwareScan: scanningOptions.layoutAwareScan,
preventMiddleMouse preventMiddleMouse
}); });

View File

@ -309,7 +309,7 @@ class Display extends EventDispatcher {
async updateOptions() { async updateOptions() {
const options = await api.optionsGet(this.getOptionsContext()); const options = await api.optionsGet(this.getOptionsContext());
const scanning = options.scanning; const {scanning: scanningOptions, sentenceParsing: sentenceParsingOptions} = options;
this._options = options; this._options = options;
this._updateDocumentOptions(options); this._updateDocumentOptions(options);
@ -320,16 +320,16 @@ class Display extends EventDispatcher {
selectedParser: options.parsing.selectedParser, selectedParser: options.parsing.selectedParser,
termSpacing: options.parsing.termSpacing, termSpacing: options.parsing.termSpacing,
scanning: { scanning: {
inputs: scanning.inputs, inputs: scanningOptions.inputs,
deepContentScan: scanning.deepDomScan, deepContentScan: scanningOptions.deepDomScan,
selectText: scanning.selectText, selectText: scanningOptions.selectText,
delay: scanning.delay, delay: scanningOptions.delay,
touchInputEnabled: scanning.touchInputEnabled, touchInputEnabled: scanningOptions.touchInputEnabled,
pointerEventsEnabled: scanning.pointerEventsEnabled, pointerEventsEnabled: scanningOptions.pointerEventsEnabled,
scanLength: scanning.length, scanLength: scanningOptions.length,
sentenceExtent: options.sentenceParsing.scanExtent, sentenceScanExtent: sentenceParsingOptions.scanExtent,
layoutAwareScan: scanning.layoutAwareScan, layoutAwareScan: scanningOptions.layoutAwareScan,
preventMiddleMouse: scanning.preventMiddleMouse.onSearchQuery preventMiddleMouse: scanningOptions.preventMiddleMouse.onSearchQuery
} }
}); });
@ -1810,7 +1810,7 @@ class Display extends EventDispatcher {
this._definitionTextScanner.on('searched', this._onDefinitionTextScannerSearched.bind(this)); this._definitionTextScanner.on('searched', this._onDefinitionTextScannerSearched.bind(this));
} }
const scanningOptions = options.scanning; const {scanning: scanningOptions, sentenceParsing: sentenceParsingOptions} = options;
this._definitionTextScanner.setOptions({ this._definitionTextScanner.setOptions({
inputs: [{ inputs: [{
include: 'mouse0', include: 'mouse0',
@ -1832,7 +1832,7 @@ class Display extends EventDispatcher {
touchInputEnabled: false, touchInputEnabled: false,
pointerEventsEnabled: false, pointerEventsEnabled: false,
scanLength: scanningOptions.length, scanLength: scanningOptions.length,
sentenceExtent: options.sentenceParsing.scanExtent, sentenceScanExtent: sentenceParsingOptions.scanExtent,
layoutAwareScan: scanningOptions.layoutAwareScan, layoutAwareScan: scanningOptions.layoutAwareScan,
preventMiddleMouse: false preventMiddleMouse: false
}); });

View File

@ -31,12 +31,16 @@ class DocumentUtil {
['\'', '\''], ['\'', '\''],
['"', '"'] ['"', '"']
]; ];
this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']); const terminatorString = '…。..??!!';
this._startQuoteMap = new Map(); this._terminatorMap = new Map();
this._endQuoteMap = new Map(); for (const char of terminatorString) {
this._terminatorMap.set(char, [false, true]);
}
this._forwardQuoteMap = new Map();
this._backwardQuoteMap = new Map();
for (const [char1, char2] of quoteArray) { for (const [char1, char2] of quoteArray) {
this._startQuoteMap.set(char1, char2); this._forwardQuoteMap.set(char1, [char2, false]);
this._endQuoteMap.set(char2, char1); this._backwardQuoteMap.set(char2, [char1, false]);
} }
} }
@ -77,10 +81,10 @@ class DocumentUtil {
} }
} }
extractSentence(source, extent, layoutAwareScan) { extractSentence(source, layoutAwareScan, extent) {
const terminatorSet = this._terminatorSet; const terminatorMap = this._terminatorMap;
const startQuoteMap = this._startQuoteMap; const forwardQuoteMap = this._forwardQuoteMap;
const endQuoteMap = this._endQuoteMap; const backwardQuoteMap = this._backwardQuoteMap;
// Scan text // Scan text
source = source.clone(); source = source.clone();
@ -98,22 +102,28 @@ class DocumentUtil {
const c = text[pos1 - 1]; const c = text[pos1 - 1];
if (c === '\n') { break; } if (c === '\n') { break; }
if (quoteStack.length === 0 && terminatorSet.has(c)) { if (quoteStack.length === 0) {
const terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo !== 'undefined') {
if (terminatorInfo[0]) { --pos1; }
break; break;
} }
}
let otherQuote = startQuoteMap.get(c); let quoteInfo = forwardQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') { if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) { if (quoteStack.length === 0) {
if (quoteInfo[1]) { --pos1; }
break; break;
} else if (quoteStack[0] === c) { } else if (quoteStack[0] === c) {
quoteStack.pop(); quoteStack.pop();
continue;
} }
} else {
otherQuote = endQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
} }
quoteInfo = backwardQuoteMap.get(c);
if (typeof quoteInfo !== 'undefined') {
quoteStack.unshift(quoteInfo[0]);
} }
} }
@ -123,23 +133,28 @@ class DocumentUtil {
const c = text[pos2]; const c = text[pos2];
if (c === '\n') { break; } if (c === '\n') { break; }
if (quoteStack.length === 0 && terminatorSet.has(c)) { if (quoteStack.length === 0) {
++pos2; const terminatorInfo = terminatorMap.get(c);
if (typeof terminatorInfo !== 'undefined') {
if (terminatorInfo[1]) { ++pos2; }
break; break;
} }
}
let otherQuote = endQuoteMap.get(c); let quoteInfo = backwardQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') { if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) { if (quoteStack.length === 0) {
if (quoteInfo[1]) { ++pos2; }
break; break;
} else if (quoteStack[0] === c) { } else if (quoteStack[0] === c) {
quoteStack.pop(); quoteStack.pop();
continue;
} }
} else {
otherQuote = startQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
} }
quoteInfo = forwardQuoteMap.get(c);
if (typeof quoteInfo !== 'undefined') {
quoteStack.unshift(quoteInfo[0]);
} }
} }

View File

@ -59,7 +59,7 @@ class TextScanner extends EventDispatcher {
this._touchInputEnabled = false; this._touchInputEnabled = false;
this._pointerEventsEnabled = false; this._pointerEventsEnabled = false;
this._scanLength = 1; this._scanLength = 1;
this._sentenceExtent = 1; this._sentenceScanExtent = 1;
this._layoutAwareScan = false; this._layoutAwareScan = false;
this._preventMiddleMouse = false; this._preventMiddleMouse = false;
this._inputs = []; this._inputs = [];
@ -134,7 +134,18 @@ class TextScanner extends EventDispatcher {
} }
} }
setOptions({inputs, deepContentScan, selectText, delay, touchInputEnabled, pointerEventsEnabled, scanLength, sentenceExtent, layoutAwareScan, preventMiddleMouse}) { setOptions({
inputs,
deepContentScan,
selectText,
delay,
touchInputEnabled,
pointerEventsEnabled,
scanLength,
sentenceScanExtent,
layoutAwareScan,
preventMiddleMouse
}) {
if (Array.isArray(inputs)) { if (Array.isArray(inputs)) {
this._inputs = inputs.map(({ this._inputs = inputs.map(({
include, include,
@ -182,8 +193,8 @@ class TextScanner extends EventDispatcher {
if (typeof scanLength === 'number') { if (typeof scanLength === 'number') {
this._scanLength = scanLength; this._scanLength = scanLength;
} }
if (typeof sentenceExtent === 'number') { if (typeof sentenceScanExtent === 'number') {
this._sentenceExtent = sentenceExtent; this._sentenceScanExtent = sentenceScanExtent;
} }
if (typeof layoutAwareScan === 'boolean') { if (typeof layoutAwareScan === 'boolean') {
this._layoutAwareScan = layoutAwareScan; this._layoutAwareScan = layoutAwareScan;
@ -711,7 +722,7 @@ class TextScanner extends EventDispatcher {
async _findTerms(textSource, optionsContext) { async _findTerms(textSource, optionsContext) {
const scanLength = this._scanLength; const scanLength = this._scanLength;
const sentenceExtent = this._sentenceExtent; const sentenceScanExtent = this._sentenceScanExtent;
const layoutAwareScan = this._layoutAwareScan; const layoutAwareScan = this._layoutAwareScan;
const searchText = this.getTextSourceContent(textSource, scanLength, layoutAwareScan); const searchText = this.getTextSourceContent(textSource, scanLength, layoutAwareScan);
if (searchText.length === 0) { return null; } if (searchText.length === 0) { return null; }
@ -720,13 +731,13 @@ class TextScanner extends EventDispatcher {
if (definitions.length === 0) { return null; } if (definitions.length === 0) { return null; }
textSource.setEndOffset(length, layoutAwareScan); textSource.setEndOffset(length, layoutAwareScan);
const sentence = this._documentUtil.extractSentence(textSource, sentenceExtent, layoutAwareScan); const sentence = this._documentUtil.extractSentence(textSource, layoutAwareScan, sentenceScanExtent);
return {definitions, sentence, type: 'terms'}; return {definitions, sentence, type: 'terms'};
} }
async _findKanji(textSource, optionsContext) { async _findKanji(textSource, optionsContext) {
const sentenceExtent = this._sentenceExtent; const sentenceScanExtent = this._sentenceScanExtent;
const layoutAwareScan = this._layoutAwareScan; const layoutAwareScan = this._layoutAwareScan;
const searchText = this.getTextSourceContent(textSource, 1, layoutAwareScan); const searchText = this.getTextSourceContent(textSource, 1, layoutAwareScan);
if (searchText.length === 0) { return null; } if (searchText.length === 0) { return null; }
@ -735,7 +746,7 @@ class TextScanner extends EventDispatcher {
if (definitions.length === 0) { return null; } if (definitions.length === 0) { return null; }
textSource.setEndOffset(1, layoutAwareScan); textSource.setEndOffset(1, layoutAwareScan);
const sentence = this._documentUtil.extractSentence(textSource, sentenceExtent, layoutAwareScan); const sentence = this._documentUtil.extractSentence(textSource, layoutAwareScan, sentenceScanExtent);
return {definitions, sentence, type: 'kanji'}; return {definitions, sentence, type: 'kanji'};
} }

View File

@ -21,7 +21,7 @@
data-end-node-selector="span" data-end-node-selector="span"
data-end-offset="0" data-end-offset="0"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="真白「心配してくださって、ありがとございます」" data-sentence="真白「心配してくださって、ありがとございます」"
> >
<span>真白「心配してくださって、ありがとございます」</span> <span>真白「心配してくださって、ありがとございます」</span>
@ -37,7 +37,7 @@
data-end-node-selector="span" data-end-node-selector="span"
data-end-offset="5" data-end-offset="5"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="心配してくださって、ありがとございます" data-sentence="心配してくださって、ありがとございます"
> >
<span>真白「心配してくださって、ありがとございます」</span> <span>真白「心配してくださって、ありがとございます」</span>
@ -53,7 +53,7 @@
data-end-node-selector="span" data-end-node-selector="span"
data-end-offset="16" data-end-offset="16"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="心配して「くださって」、ありがと「ございます」" data-sentence="心配して「くださって」、ありがと「ございます」"
> >
<span>真白「心配して「くださって」、ありがと「ございます」」</span> <span>真白「心配して「くださって」、ありがと「ございます」」</span>
@ -69,7 +69,7 @@
data-end-node-selector="span" data-end-node-selector="span"
data-end-offset="4" data-end-offset="4"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="ありがとございます。" data-sentence="ありがとございます。"
> >
<span>ありがとございます。ありがとございます。</span> <span>ありがとございます。ありがとございます。</span>
@ -85,7 +85,7 @@
data-end-node-selector="span" data-end-node-selector="span"
data-end-offset="14" data-end-offset="14"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="ありがとございます。" data-sentence="ありがとございます。"
> >
<span>ありがとございます。ありがとございます。</span> <span>ありがとございます。ありがとございます。</span>
@ -101,7 +101,7 @@
data-end-node-selector="input" data-end-node-selector="input"
data-end-offset="0" data-end-offset="0"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="真白「心配してくださって、ありがとございます」" data-sentence="真白「心配してくださって、ありがとございます」"
data-has-imposter="true" data-has-imposter="true"
> >
@ -118,7 +118,7 @@
data-end-node-selector="textarea" data-end-node-selector="textarea"
data-end-offset="0" data-end-offset="0"
data-result-type="TextSourceRange", data-result-type="TextSourceRange",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="真白「心配してくださって、ありがとございます」" data-sentence="真白「心配してくださって、ありがとございます」"
data-has-imposter="true" data-has-imposter="true"
> >
@ -135,7 +135,7 @@
data-end-node-selector="button" data-end-node-selector="button"
data-end-offset="0" data-end-offset="0"
data-result-type="TextSourceElement", data-result-type="TextSourceElement",
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="よみちゃん" data-sentence="よみちゃん"
> >
<button style="width: 100%; box-sizing: border-box; font-family: inherit; font-size: inherit; border: 1px solid #d8d8d8; background-color: #f0f0f0; padding: 0.2em;">よみちゃん</button> <button style="width: 100%; box-sizing: border-box; font-family: inherit; font-size: inherit; border: 1px solid #d8d8d8; background-color: #f0f0f0; padding: 0.2em;">よみちゃん</button>
@ -151,7 +151,7 @@
data-end-node-selector="img" data-end-node-selector="img"
data-end-offset="0" data-end-offset="0"
data-result-type="TextSourceElement" data-result-type="TextSourceElement"
data-sentence-extent="100" data-sentence-scan-extent="100"
data-sentence="よみちゃん" data-sentence="よみちゃん"
> >
<img src="data:image/gif;base64,R0lGODdhBwAHAIABAAAAAP///ywAAAAABwAHAAACDIRvEaC32FpCbEkKCgA7" alt="よみちゃん" title="よみちゃん" style="width: 70px; height: 70px; image-rendering: crisp-edges; image-rendering: pixelated; display: block;" /> <img src="data:image/gif;base64,R0lGODdhBwAHAIABAAAAAP///ywAAAAABwAHAAACDIRvEaC32FpCbEkKCgA7" alt="よみちゃん" title="よみちゃん" style="width: 70px; height: 70px; image-rendering: crisp-edges; image-rendering: pixelated; display: block;" />

View File

@ -127,7 +127,7 @@ async function testDocumentTextScanningFunctions(dom, {DocumentUtil, TextSourceR
endNodeSelector, endNodeSelector,
endOffset, endOffset,
resultType, resultType,
sentenceExtent, sentenceScanExtent,
sentence, sentence,
hasImposter hasImposter
} = testElement.dataset; } = testElement.dataset;
@ -139,7 +139,7 @@ async function testDocumentTextScanningFunctions(dom, {DocumentUtil, TextSourceR
startOffset = parseInt(startOffset, 10); startOffset = parseInt(startOffset, 10);
endOffset = parseInt(endOffset, 10); endOffset = parseInt(endOffset, 10);
sentenceExtent = parseInt(sentenceExtent, 10); sentenceScanExtent = parseInt(sentenceScanExtent, 10);
assert.notStrictEqual(elementFromPointValue, null); assert.notStrictEqual(elementFromPointValue, null);
assert.notStrictEqual(caretRangeFromPointValue, null); assert.notStrictEqual(caretRangeFromPointValue, null);
@ -182,7 +182,7 @@ async function testDocumentTextScanningFunctions(dom, {DocumentUtil, TextSourceR
if (source === null) { continue; } if (source === null) { continue; }
// Test docSentenceExtract // Test docSentenceExtract
const sentenceActual = documentUtil.extractSentence(source, sentenceExtent, false).text; const sentenceActual = documentUtil.extractSentence(source, false, sentenceScanExtent).text;
assert.strictEqual(sentenceActual, sentence); assert.strictEqual(sentenceActual, sentence);
// Clean // Clean