Refactor sentence scanning (#1213)

* Update tests

* Update extractSentence implementation

* Remove old extractSentence implementation

* Optimize maps/sets
This commit is contained in:
toasted-nutbread 2021-01-09 19:02:51 -05:00 committed by GitHub
parent 11e9eb2295
commit d698911bc9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 119 additions and 50 deletions

View File

@ -24,6 +24,20 @@
class DocumentUtil {
constructor() {
this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/;
const quoteArray = [
['「', '」'],
['『', '』'],
['\'', '\''],
['"', '"']
];
this._terminatorSet = new Set(['…', '。', '', '.', '', '?', '', '!']);
this._startQuoteMap = new Map();
this._endQuoteMap = new Map();
for (const [char1, char2] of quoteArray) {
this._startQuoteMap.set(char1, char2);
this._endQuoteMap.set(char2, char1);
}
}
getRangeFromPoint(x, y, deepContentScan) {
@ -64,72 +78,79 @@ class DocumentUtil {
}
extractSentence(source, extent, layoutAwareScan) {
const quotesFwd = {'「': '」', '『': '』', "'": "'", '"': '"'};
const quotesBwd = {'」': '「', '』': '『', "'": "'", '"': '"'};
const terminators = '…。..?!';
const terminatorSet = this._terminatorSet;
const startQuoteMap = this._startQuoteMap;
const endQuoteMap = this._endQuoteMap;
const sourceLocal = source.clone();
const position = sourceLocal.setStartOffset(extent, layoutAwareScan);
sourceLocal.setEndOffset(extent * 2 - position, layoutAwareScan, true);
const content = sourceLocal.text();
// Scan text
source = source.clone();
const startLength = source.setStartOffset(extent, layoutAwareScan);
const endLength = source.setEndOffset(extent * 2 - startLength, layoutAwareScan, true);
const text = source.text();
const textLength = text.length;
const textEndAnchor = textLength - endLength;
let pos1 = startLength;
let pos2 = textEndAnchor;
// Move backward
let quoteStack = [];
for (; pos1 > 0; --pos1) {
const c = text[pos1 - 1];
if (c === '\n') { break; }
let startPos = 0;
for (let i = position; i >= startPos; --i) {
const c = content[i];
if (c === '\n') {
startPos = i + 1;
break;
}
if (quoteStack.length === 0 && (terminators.includes(c) || c in quotesFwd)) {
startPos = i + 1;
break;
}
if (quoteStack.length > 0 && c === quoteStack[0]) {
quoteStack.pop();
} else if (c in quotesBwd) {
quoteStack.unshift(quotesBwd[c]);
}
}
quoteStack = [];
let endPos = content.length;
for (let i = position; i <= endPos; ++i) {
const c = content[i];
if (c === '\n') {
endPos = i + 1;
if (quoteStack.length === 0 && terminatorSet.has(c)) {
break;
}
let otherQuote = startQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
if (quoteStack.length === 0) {
if (terminators.includes(c)) {
endPos = i + 1;
break;
} else if (c in quotesBwd) {
endPos = i;
break;
}
}
if (quoteStack.length > 0 && c === quoteStack[0]) {
} else if (quoteStack[0] === c) {
quoteStack.pop();
} else if (c in quotesFwd) {
quoteStack.unshift(quotesFwd[c]);
}
} else {
otherQuote = endQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
}
}
}
const text = content.substring(startPos, endPos);
const padding = text.length - text.replace(/^\s+/, '').length;
// Move forward
quoteStack = [];
for (; pos2 < textLength; ++pos2) {
const c = text[pos2];
if (c === '\n') { break; }
if (quoteStack.length === 0 && terminatorSet.has(c)) {
++pos2;
break;
}
let otherQuote = endQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
if (quoteStack.length === 0) {
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
}
} else {
otherQuote = startQuoteMap.get(c);
if (typeof otherQuote !== 'undefined') {
quoteStack.unshift(otherQuote);
}
}
}
// Trim whitespace
for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
// Result
return {
text: text.trim(),
offset: position - startPos - padding
text: text.substring(pos1, pos2),
offset: startLength - pos1
};
}

View File

@ -43,6 +43,54 @@
<span>真白「心配してくださって、ありがとございます」</span>
</div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="16"
data-end-node-selector="span"
data-end-offset="16"
data-result-type="TextSourceRange"
data-sentence-extent="100"
data-sentence="心配して「くださって」、ありがと「ございます」"
>
<span>真白「心配して「くださって」、ありがと「ございます」」</span>
</div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="4"
data-end-node-selector="span"
data-end-offset="4"
data-result-type="TextSourceRange"
data-sentence-extent="100"
data-sentence="ありがとございます。"
>
<span>ありがとございます。ありがとございます。</span>
</div>
<div
class="test"
data-test-type="scan"
data-element-from-point-selector="span"
data-caret-range-from-point-selector="span"
data-start-node-selector="span"
data-start-offset="14"
data-end-node-selector="span"
data-end-offset="14"
data-result-type="TextSourceRange"
data-sentence-extent="100"
data-sentence="ありがとございます。"
>
<span>ありがとございます。ありがとございます。</span>
</div>
<div
class="test"
data-test-type="scan"