Refactor sentence scanning (#1213)
* Update tests
* Update extractSentence implementation
* Remove old extractSentence implementation
* Optimize maps/sets
This commit is contained in:
parent
11e9eb2295
commit
d698911bc9
@ -24,6 +24,20 @@
|
|||||||
class DocumentUtil {
|
class DocumentUtil {
|
||||||
constructor() {
|
constructor() {
|
||||||
this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/;
|
this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/;
|
||||||
|
|
||||||
|
const quoteArray = [
|
||||||
|
['「', '」'],
|
||||||
|
['『', '』'],
|
||||||
|
['\'', '\''],
|
||||||
|
['"', '"']
|
||||||
|
];
|
||||||
|
this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']);
|
||||||
|
this._startQuoteMap = new Map();
|
||||||
|
this._endQuoteMap = new Map();
|
||||||
|
for (const [char1, char2] of quoteArray) {
|
||||||
|
this._startQuoteMap.set(char1, char2);
|
||||||
|
this._endQuoteMap.set(char2, char1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
getRangeFromPoint(x, y, deepContentScan) {
|
getRangeFromPoint(x, y, deepContentScan) {
|
||||||
@ -64,72 +78,79 @@ class DocumentUtil {
|
|||||||
}
|
}
|
||||||
|
|
||||||
extractSentence(source, extent, layoutAwareScan) {
|
extractSentence(source, extent, layoutAwareScan) {
|
||||||
const quotesFwd = {'「': '」', '『': '』', "'": "'", '"': '"'};
|
const terminatorSet = this._terminatorSet;
|
||||||
const quotesBwd = {'」': '「', '』': '『', "'": "'", '"': '"'};
|
const startQuoteMap = this._startQuoteMap;
|
||||||
const terminators = '…。..??!!';
|
const endQuoteMap = this._endQuoteMap;
|
||||||
|
|
||||||
const sourceLocal = source.clone();
|
// Scan text
|
||||||
const position = sourceLocal.setStartOffset(extent, layoutAwareScan);
|
source = source.clone();
|
||||||
sourceLocal.setEndOffset(extent * 2 - position, layoutAwareScan, true);
|
const startLength = source.setStartOffset(extent, layoutAwareScan);
|
||||||
const content = sourceLocal.text();
|
const endLength = source.setEndOffset(extent * 2 - startLength, layoutAwareScan, true);
|
||||||
|
const text = source.text();
|
||||||
|
const textLength = text.length;
|
||||||
|
const textEndAnchor = textLength - endLength;
|
||||||
|
let pos1 = startLength;
|
||||||
|
let pos2 = textEndAnchor;
|
||||||
|
|
||||||
|
// Move backward
|
||||||
let quoteStack = [];
|
let quoteStack = [];
|
||||||
|
for (; pos1 > 0; --pos1) {
|
||||||
|
const c = text[pos1 - 1];
|
||||||
|
if (c === '\n') { break; }
|
||||||
|
|
||||||
let startPos = 0;
|
if (quoteStack.length === 0 && terminatorSet.has(c)) {
|
||||||
for (let i = position; i >= startPos; --i) {
|
|
||||||
const c = content[i];
|
|
||||||
|
|
||||||
if (c === '\n') {
|
|
||||||
startPos = i + 1;
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (quoteStack.length === 0 && (terminators.includes(c) || c in quotesFwd)) {
|
let otherQuote = startQuoteMap.get(c);
|
||||||
startPos = i + 1;
|
if (typeof otherQuote !== 'undefined') {
|
||||||
break;
|
if (quoteStack.length === 0) {
|
||||||
}
|
|
||||||
|
|
||||||
if (quoteStack.length > 0 && c === quoteStack[0]) {
|
|
||||||
quoteStack.pop();
|
|
||||||
} else if (c in quotesBwd) {
|
|
||||||
quoteStack.unshift(quotesBwd[c]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
quoteStack = [];
|
|
||||||
|
|
||||||
let endPos = content.length;
|
|
||||||
for (let i = position; i <= endPos; ++i) {
|
|
||||||
const c = content[i];
|
|
||||||
|
|
||||||
if (c === '\n') {
|
|
||||||
endPos = i + 1;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (quoteStack.length === 0) {
|
|
||||||
if (terminators.includes(c)) {
|
|
||||||
endPos = i + 1;
|
|
||||||
break;
|
|
||||||
} else if (c in quotesBwd) {
|
|
||||||
endPos = i;
|
|
||||||
break;
|
break;
|
||||||
|
} else if (quoteStack[0] === c) {
|
||||||
|
quoteStack.pop();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
otherQuote = endQuoteMap.get(c);
|
||||||
|
if (typeof otherQuote !== 'undefined') {
|
||||||
|
quoteStack.unshift(otherQuote);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (quoteStack.length > 0 && c === quoteStack[0]) {
|
// Move forward
|
||||||
quoteStack.pop();
|
quoteStack = [];
|
||||||
} else if (c in quotesFwd) {
|
for (; pos2 < textLength; ++pos2) {
|
||||||
quoteStack.unshift(quotesFwd[c]);
|
const c = text[pos2];
|
||||||
|
if (c === '\n') { break; }
|
||||||
|
|
||||||
|
if (quoteStack.length === 0 && terminatorSet.has(c)) {
|
||||||
|
++pos2;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
let otherQuote = endQuoteMap.get(c);
|
||||||
|
if (typeof otherQuote !== 'undefined') {
|
||||||
|
if (quoteStack.length === 0) {
|
||||||
|
break;
|
||||||
|
} else if (quoteStack[0] === c) {
|
||||||
|
quoteStack.pop();
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
otherQuote = startQuoteMap.get(c);
|
||||||
|
if (typeof otherQuote !== 'undefined') {
|
||||||
|
quoteStack.unshift(otherQuote);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const text = content.substring(startPos, endPos);
|
// Trim whitespace
|
||||||
const padding = text.length - text.replace(/^\s+/, '').length;
|
for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
|
||||||
|
for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
|
||||||
|
|
||||||
|
// Result
|
||||||
return {
|
return {
|
||||||
text: text.trim(),
|
text: text.substring(pos1, pos2),
|
||||||
offset: position - startPos - padding
|
offset: startLength - pos1
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -43,6 +43,54 @@
|
|||||||
<span>真白「心配してくださって、ありがとございます」</span>
|
<span>真白「心配してくださって、ありがとございます」</span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div
|
||||||
|
class="test"
|
||||||
|
data-test-type="scan"
|
||||||
|
data-element-from-point-selector="span"
|
||||||
|
data-caret-range-from-point-selector="span"
|
||||||
|
data-start-node-selector="span"
|
||||||
|
data-start-offset="16"
|
||||||
|
data-end-node-selector="span"
|
||||||
|
data-end-offset="16"
|
||||||
|
data-result-type="TextSourceRange",
|
||||||
|
data-sentence-extent="100"
|
||||||
|
data-sentence="心配して「くださって」、ありがと「ございます」"
|
||||||
|
>
|
||||||
|
<span>真白「心配して「くださって」、ありがと「ございます」」</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div
|
||||||
|
class="test"
|
||||||
|
data-test-type="scan"
|
||||||
|
data-element-from-point-selector="span"
|
||||||
|
data-caret-range-from-point-selector="span"
|
||||||
|
data-start-node-selector="span"
|
||||||
|
data-start-offset="4"
|
||||||
|
data-end-node-selector="span"
|
||||||
|
data-end-offset="4"
|
||||||
|
data-result-type="TextSourceRange",
|
||||||
|
data-sentence-extent="100"
|
||||||
|
data-sentence="ありがとございます。"
|
||||||
|
>
|
||||||
|
<span>ありがとございます。ありがとございます。</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div
|
||||||
|
class="test"
|
||||||
|
data-test-type="scan"
|
||||||
|
data-element-from-point-selector="span"
|
||||||
|
data-caret-range-from-point-selector="span"
|
||||||
|
data-start-node-selector="span"
|
||||||
|
data-start-offset="14"
|
||||||
|
data-end-node-selector="span"
|
||||||
|
data-end-offset="14"
|
||||||
|
data-result-type="TextSourceRange",
|
||||||
|
data-sentence-extent="100"
|
||||||
|
data-sentence="ありがとございます。"
|
||||||
|
>
|
||||||
|
<span>ありがとございます。ありがとございます。</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div
|
<div
|
||||||
class="test"
|
class="test"
|
||||||
data-test-type="scan"
|
data-test-type="scan"
|
||||||
|
Loading…
Reference in New Issue
Block a user