Refactor sentence scanning (#1213)

* Update tests

* Update extractSentence implementation

* Remove old extractSentence implementation

* Optimize maps/sets
This commit is contained in:
toasted-nutbread 2021-01-09 19:02:51 -05:00 committed by GitHub
parent 11e9eb2295
commit d698911bc9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 119 additions and 50 deletions

View File

@ -24,6 +24,20 @@
class DocumentUtil { class DocumentUtil {
constructor() { constructor() {
this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/; this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/;
const quoteArray = [
['「', '」'],
['『', '』'],
['\'', '\''],
['"', '"']
];
this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']);
this._startQuoteMap = new Map();
this._endQuoteMap = new Map();
for (const [char1, char2] of quoteArray) {
this._startQuoteMap.set(char1, char2);
this._endQuoteMap.set(char2, char1);
}
} }
getRangeFromPoint(x, y, deepContentScan) { getRangeFromPoint(x, y, deepContentScan) {
@ -64,72 +78,79 @@ class DocumentUtil {
} }
extractSentence(source, extent, layoutAwareScan) { extractSentence(source, extent, layoutAwareScan) {
const quotesFwd = {'「': '」', '『': '』', "'": "'", '"': '"'}; const terminatorSet = this._terminatorSet;
const quotesBwd = {'」': '「', '』': '『', "'": "'", '"': '"'}; const startQuoteMap = this._startQuoteMap;
const terminators = '…。..??!!'; const endQuoteMap = this._endQuoteMap;
const sourceLocal = source.clone(); // Scan text
const position = sourceLocal.setStartOffset(extent, layoutAwareScan); source = source.clone();
sourceLocal.setEndOffset(extent * 2 - position, layoutAwareScan, true); const startLength = source.setStartOffset(extent, layoutAwareScan);
const content = sourceLocal.text(); const endLength = source.setEndOffset(extent * 2 - startLength, layoutAwareScan, true);
const text = source.text();
const textLength = text.length;
const textEndAnchor = textLength - endLength;
let pos1 = startLength;
let pos2 = textEndAnchor;
// Move backward
let quoteStack = []; let quoteStack = [];
for (; pos1 > 0; --pos1) {
const c = text[pos1 - 1];
if (c === '\n') { break; }
let startPos = 0; if (quoteStack.length === 0 && terminatorSet.has(c)) {
for (let i = position; i >= startPos; --i) {
const c = content[i];
if (c === '\n') {
startPos = i + 1;
break; break;
} }
if (quoteStack.length === 0 && (terminators.includes(c) || c in quotesFwd)) { let otherQuote = startQuoteMap.get(c);
startPos = i + 1; if (typeof otherQuote !== 'undefined') {
break; if (quoteStack.length === 0) {
}
if (quoteStack.length > 0 && c === quoteStack[0]) {
quoteStack.pop();
} else if (c in quotesBwd) {
quoteStack.unshift(quotesBwd[c]);
}
}
quoteStack = [];
let endPos = content.length;
for (let i = position; i <= endPos; ++i) {
const c = content[i];
if (c === '\n') {
endPos = i + 1;
break;
}
if (quoteStack.length === 0) {
if (terminators.includes(c)) {
endPos = i + 1;
break;
} else if (c in quotesBwd) {
endPos = i;
break; break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
}
} else {
otherQuote = endQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
} }
} }
}
if (quoteStack.length > 0 && c === quoteStack[0]) { // Move forward
quoteStack.pop(); quoteStack = [];
} else if (c in quotesFwd) { for (; pos2 < textLength; ++pos2) {
quoteStack.unshift(quotesFwd[c]); const c = text[pos2];
if (c === '\n') { break; }
if (quoteStack.length === 0 && terminatorSet.has(c)) {
++pos2;
break;
}
let otherQuote = endQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
if (quoteStack.length === 0) {
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
}
} else {
otherQuote = startQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
}
} }
} }
const text = content.substring(startPos, endPos); // Trim whitespace
const padding = text.length - text.replace(/^\s+/, '').length; for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
// Result
return { return {
text: text.trim(), text: text.substring(pos1, pos2),
offset: position - startPos - padding offset: startLength - pos1
}; };
} }

View File

@ -43,6 +43,54 @@
<span>真白「心配してくださって、ありがとございます」</span> <span>真白「心配してくださって、ありがとございます」</span>
</div> </div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="16"
data-end-node-selector="span"
data-end-offset="16"
data-result-type="TextSourceRange",
data-sentence-extent="100"
data-sentence="心配して「くださって」、ありがと「ございます」"
>
<span>真白「心配して「くださって」、ありがと「ございます」」</span>
</div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="4"
data-end-node-selector="span"
data-end-offset="4"
data-result-type="TextSourceRange",
data-sentence-extent="100"
data-sentence="ありがとございます。"
>
<span>ありがとございます。ありがとございます。</span>
</div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="14"
data-end-node-selector="span"
data-end-offset="14"
data-result-type="TextSourceRange",
data-sentence-extent="100"
data-sentence="ありがとございます。"
>
<span>ありがとございます。ありがとございます。</span>
</div>
<div <div
class="test" class="test"
data-test-type="scan" data-test-type="scan"