Frequency dictionary sort (#1938)

* Add sortDictionary/sortDictionaryOrder options

* Update options

* Add API.getTermFrequencies

* Add settings

* Implement frequency dictionary sorting

* Update test

* Update test data

* Fix handling of undefined rank-based frequencies
This commit is contained in:
toasted-nutbread 2021-09-26 11:08:16 -04:00 committed by GitHub
parent 88e71f8223
commit 9899727d7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 603 additions and 8 deletions

View File

@ -2259,10 +2259,14 @@ input[type=number].dictionary-priority {
}
.horizontal-flex.horizontal-flex-nowrap {
flex-wrap: nowrap;
margin-left: 0;
}
.horizontal-flex>* {
margin-left: 0.375em;
}
.horizontal-flex.horizontal-flex-nowrap>*:first-child {
margin-left: 0;
}
.horizontal-flex-fill {
flex-grow: 1;
}

View File

@ -116,7 +116,9 @@
"popupActionBarVisibility",
"popupActionBarLocation",
"frequencyDisplayMode",
"termDisplayMode"
"termDisplayMode",
"sortFrequencyDictionary",
"sortFrequencyDictionaryOrder"
],
"properties": {
"enable": {
@ -284,6 +286,15 @@
"type": "string",
"enum": ["ruby", "ruby-and-reading", "term-and-reading"],
"default": "ruby"
},
"sortFrequencyDictionary": {
"type": ["string", "null"],
"default": null
},
"sortFrequencyDictionaryOrder": {
"type": "string",
"enum": ["ascending", "descending"],
"default": "descending"
}
}
},

View File

@ -125,7 +125,8 @@ class Backend {
['triggerDatabaseUpdated', {async: false, contentScript: true, handler: this._onApiTriggerDatabaseUpdated.bind(this)}],
['testMecab', {async: true, contentScript: true, handler: this._onApiTestMecab.bind(this)}],
['textHasJapaneseCharacters', {async: false, contentScript: true, handler: this._onApiTextHasJapaneseCharacters.bind(this)}],
['documentStart', {async: false, contentScript: true, handler: this._onApiDocumentStart.bind(this)}]
['documentStart', {async: false, contentScript: true, handler: this._onApiDocumentStart.bind(this)}],
['getTermFrequencies', {async: true, contentScript: true, handler: this._onApiGetTermFrequencies.bind(this)}]
]);
this._messageHandlersWithProgress = new Map([
]);
@ -748,6 +749,10 @@ class Backend {
this._updateTabAccessibility(url, tab, frameId);
}
async _onApiGetTermFrequencies({termReadingList, dictionaries}) {
return await this._translator.getTermFrequencies(termReadingList, dictionaries);
}
// Command handlers
async _onCommandOpenSearchPage(params) {
@ -1953,7 +1958,7 @@ class Backend {
const {wildcard} = details;
const enabledDictionaryMap = this._getTranslatorEnabledDictionaryMap(options);
const {
general: {mainDictionary},
general: {mainDictionary, sortFrequencyDictionary, sortFrequencyDictionaryOrder},
scanning: {alphanumeric},
translation: {
convertHalfWidthCharacters,
@ -1979,6 +1984,8 @@ class Backend {
return {
wildcard,
mainDictionary,
sortFrequencyDictionary,
sortFrequencyDictionaryOrder,
removeNonJapaneseCharacters: !alphanumeric,
convertHalfWidthCharacters,
convertNumericCharacters,

View File

@ -168,6 +168,10 @@ class API {
return this._invoke('textHasJapaneseCharacters', {text});
}
getTermFrequencies(termReadingList, dictionaries) {
return this._invoke('getTermFrequencies', {termReadingList, dictionaries});
}
// Utilities
_createActionPort(timeout=5000) {

View File

@ -463,7 +463,8 @@ class OptionsUtil {
{async: false, update: this._updateVersion11.bind(this)},
{async: true, update: this._updateVersion12.bind(this)},
{async: true, update: this._updateVersion13.bind(this)},
{async: false, update: this._updateVersion14.bind(this)}
{async: false, update: this._updateVersion14.bind(this)},
{async: false, update: this._updateVersion15.bind(this)}
];
if (typeof targetVersion === 'number' && targetVersion < result.length) {
result.splice(targetVersion);
@ -876,4 +877,15 @@ class OptionsUtil {
}
return options;
}
_updateVersion15(options) {
// Version 15 changes:
// Added general.sortFrequencyDictionary.
// Added general.sortFrequencyDictionaryOrder.
for (const profile of options.profiles) {
profile.options.general.sortFrequencyDictionary = null;
profile.options.general.sortFrequencyDictionaryOrder = 'descending';
}
return options;
}
}

View File

@ -64,6 +64,8 @@ class Translator {
* {
* wildcard: (enum: null, 'prefix', 'suffix'),
* mainDictionary: (string),
* sortFrequencyDictionary: (null or string),
* sortFrequencyDictionaryOrder: (enum: 'ascending', 'descending'),
* removeNonJapaneseCharacters: (boolean),
* convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'),
* convertNumericCharacters: (enum: 'false', 'true', 'variant'),
@ -92,7 +94,7 @@ class Translator {
* @returns An object of the structure `{dictionaryEntries, originalTextLength}`.
*/
async findTerms(mode, text, options) {
const {enabledDictionaryMap, excludeDictionaryDefinitions} = options;
const {enabledDictionaryMap, excludeDictionaryDefinitions, sortFrequencyDictionary, sortFrequencyDictionaryOrder} = options;
let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options);
switch (mode) {
@ -115,6 +117,9 @@ class Translator {
await this._expandTermTags(dictionaryEntries);
}
if (sortFrequencyDictionary !== null) {
this._updateSortFrequencies(dictionaryEntries, sortFrequencyDictionary, sortFrequencyDictionaryOrder === 'ascending');
}
if (dictionaryEntries.length > 1) {
this._sortTermDictionaryEntries(dictionaryEntries);
}
@ -176,6 +181,48 @@ class Translator {
return dictionaryEntries;
}
/**
* Gets a list of frequency information for a given list of term-reading pairs
* and a list of dictionaries.
* @param termReadingList An array of `{term, reading}` pairs. If reading is null,
* the reading won't be compared.
* @param dictionaries An array of dictionary names.
* @returns An array of objects with the format
* `{term, reading, dictionary, hasReading, frequency}`.
*/
async getTermFrequencies(termReadingList, dictionaries) {
const dictionarySet = new Set();
for (const dictionary of dictionaries) {
dictionarySet.add(dictionary);
}
const termList = termReadingList.map(({term}) => term);
const metas = await this._database.findTermMetaBulk(termList, dictionarySet);
const results = [];
for (const {mode, data, dictionary, index} of metas) {
if (mode !== 'freq') { continue; }
let {term, reading} = termReadingList[index];
let frequency = data;
const hasReading = (data !== null && typeof data === 'object');
if (hasReading) {
if (data.reading !== reading) {
if (reading !== null) { continue; }
reading = data.reading;
}
frequency = data.frequency;
}
results.push({
term,
reading,
dictionary,
hasReading,
frequency
});
}
return results;
}
// Find terms internal implementation
async _findTermsInternal(text, enabledDictionaryMap, options) {
@ -1035,7 +1082,20 @@ class Translator {
}
_createTermDefinition(index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries) {
return {index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries};
return {
index,
headwordIndices,
dictionary,
dictionaryIndex,
dictionaryPriority,
id,
score,
frequencyOrder: 0,
sequences,
isPrimary,
tags,
entries
};
}
_createTermPronunciation(index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, pitches) {
@ -1052,6 +1112,7 @@ class Translator {
isPrimary,
inflections,
score,
frequencyOrder: 0,
dictionaryIndex,
dictionaryPriority,
sourceTermExactMatchCount,
@ -1314,6 +1375,10 @@ class Translator {
i = v2.dictionaryPriority - v1.dictionaryPriority;
if (i !== 0) { return i; }
// Sort by frequency order
i = v1.frequencyOrder - v2.frequencyOrder;
if (i !== 0) { return i; }
// Sort by term score
i = v2.score - v1.score;
if (i !== 0) { return i; }
@ -1345,6 +1410,10 @@ class Translator {
let i = v2.dictionaryPriority - v1.dictionaryPriority;
if (i !== 0) { return i; }
// Sort by frequency order
i = v1.frequencyOrder - v2.frequencyOrder;
if (i !== 0) { return i; }
// Sort by term score
i = v2.score - v1.score;
if (i !== 0) { return i; }
@ -1416,4 +1485,43 @@ class Translator {
frequencies.sort(compare);
}
}
_updateSortFrequencies(dictionaryEntries, dictionary, ascending) {
const frequencyMap = new Map();
for (const dictionaryEntry of dictionaryEntries) {
const {definitions, frequencies} = dictionaryEntry;
let frequencyMin = Number.MAX_SAFE_INTEGER;
let frequencyMax = Number.MIN_SAFE_INTEGER;
for (const item of frequencies) {
if (item.dictionary !== dictionary) { continue; }
const {headwordIndex, frequency} = item;
if (typeof frequency !== 'number') { continue; }
frequencyMap.set(headwordIndex, frequency);
frequencyMin = Math.min(frequencyMin, frequency);
frequencyMax = Math.max(frequencyMax, frequency);
}
dictionaryEntry.frequencyOrder = (
frequencyMin <= frequencyMax ?
(ascending ? frequencyMin : -frequencyMax) :
(ascending ? Number.MAX_SAFE_INTEGER : 0)
);
for (const definition of definitions) {
frequencyMin = Number.MAX_SAFE_INTEGER;
frequencyMax = Number.MIN_SAFE_INTEGER;
const {headwordIndices} = definition;
for (const headwordIndex of headwordIndices) {
const frequency = frequencyMap.get(headwordIndex);
if (typeof frequency !== 'number') { continue; }
frequencyMin = Math.min(frequencyMin, frequency);
frequencyMax = Math.max(frequencyMax, frequency);
}
definition.frequencyOrder = (
frequencyMin <= frequencyMax ?
(ascending ? frequencyMin : -frequencyMax) :
(ascending ? Number.MAX_SAFE_INTEGER : 0)
);
}
frequencyMap.clear();
}
}
}

View File

@ -42,6 +42,7 @@
* SentenceTerminationCharactersController
* SettingsController
* SettingsDisplayController
* SortFrequencyDictionaryController
* StatusFooter
* StorageController
* TranslationTextReplacementsController
@ -167,6 +168,9 @@ async function setupGenericSettingsController(genericSettingController) {
const collapsibleDictionaryController = new CollapsibleDictionaryController(settingsController);
collapsibleDictionaryController.prepare();
const sortFrequencyDictionaryController = new SortFrequencyDictionaryController(settingsController);
sortFrequencyDictionaryController.prepare();
await Promise.all(preparePromises);
document.documentElement.dataset.loaded = 'true';

View File

@ -0,0 +1,169 @@
/*
* Copyright (C) 2021 Yomichan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
 * Settings-page controller for the frequency sorting options.
 * It fills the dictionary selector, mirrors the profile settings
 * `general.sortFrequencyDictionary` and `general.sortFrequencyDictionaryOrder`
 * into the page controls, writes control changes back to the profile, and
 * can guess the sort order of a dictionary by sampling a fixed set of terms.
 */
class SortFrequencyDictionaryController {
    /**
     * @param settingsController The settings controller used to read
     *   dictionary info and options and to write profile settings.
     */
    constructor(settingsController) {
        this._settingsController = settingsController;
        this._sortFrequencyDictionarySelect = null;
        this._sortFrequencyDictionaryOrderSelect = null;
        this._sortFrequencyDictionaryOrderAutoButton = null;
        this._sortFrequencyDictionaryOrderContainerNode = null;
        this._getDictionaryInfoToken = null;
    }

    /**
     * Looks up the page controls, performs the initial population, and then
     * registers the event listeners.
     */
    async prepare() {
        const find = (selector) => document.querySelector(selector);
        this._sortFrequencyDictionarySelect = find('#sort-frequency-dictionary');
        this._sortFrequencyDictionaryOrderSelect = find('#sort-frequency-dictionary-order');
        this._sortFrequencyDictionaryOrderAutoButton = find('#sort-frequency-dictionary-order-auto');
        this._sortFrequencyDictionaryOrderContainerNode = find('#sort-frequency-dictionary-order-container');

        await this._onDatabaseUpdated();

        yomichan.on('databaseUpdated', this._onDatabaseUpdated.bind(this));
        this._settingsController.on('optionsChanged', this._onOptionsChanged.bind(this));

        const domListeners = [
            [this._sortFrequencyDictionarySelect, 'change', this._onSortFrequencyDictionarySelectChange],
            [this._sortFrequencyDictionaryOrderSelect, 'change', this._onSortFrequencyDictionaryOrderSelectChange],
            [this._sortFrequencyDictionaryOrderAutoButton, 'click', this._onSortFrequencyDictionaryOrderAutoButtonClick]
        ];
        for (const [node, eventName, handler] of domListeners) {
            node.addEventListener(eventName, handler.bind(this));
        }
    }

    // Private

    /**
     * Rebuilds the dictionary selector from the current dictionary info and
     * then re-applies the current options to the controls.
     */
    async _onDatabaseUpdated() {
        // A fresh token per call: if another call starts while the dictionary
        // info is being fetched, this call sees a different token and stops.
        const requestToken = {};
        this._getDictionaryInfoToken = requestToken;
        const dictionaryInfos = await this._settingsController.getDictionaryInfo();
        const superseded = (this._getDictionaryInfoToken !== requestToken);
        if (superseded) { return; }
        this._getDictionaryInfoToken = null;

        this._updateDictionaryOptions(dictionaryInfos);

        const currentOptions = await this._settingsController.getOptions();
        this._onOptionsChanged({options: currentOptions});
    }

    /**
     * Copies the frequency sorting settings from the options into the controls.
     * The order container is hidden while no dictionary is selected.
     */
    _onOptionsChanged({options}) {
        const {general} = options;
        const dictionary = general.sortFrequencyDictionary;
        const order = general.sortFrequencyDictionaryOrder;
        const noDictionary = (dictionary === null);

        // The empty string is the value of the "None" entry.
        this._sortFrequencyDictionarySelect.value = (noDictionary ? '' : dictionary);
        this._sortFrequencyDictionaryOrderSelect.value = order;
        this._sortFrequencyDictionaryOrderContainerNode.hidden = noDictionary;
    }

    /** Handles a change of the dictionary selector. */
    _onSortFrequencyDictionarySelectChange() {
        const selected = this._sortFrequencyDictionarySelect.value;
        this._setSortFrequencyDictionaryValue(selected === '' ? null : selected);
    }

    /** Handles a change of the order selector. */
    _onSortFrequencyDictionaryOrderSelectChange() {
        const order = this._sortFrequencyDictionaryOrderSelect.value;
        this._setSortFrequencyDictionaryOrderValue(order);
    }

    /** Handles a click on the Auto button; does nothing when no dictionary is selected. */
    _onSortFrequencyDictionaryOrderAutoButtonClick() {
        const selected = this._sortFrequencyDictionarySelect.value;
        if (selected !== '') {
            this._autoUpdateOrder(selected);
        }
    }

    /**
     * Replaces the contents of the dictionary selector with a "None" entry
     * followed by one entry per dictionary that is not known to lack
     * frequency data.
     * @param dictionaries An iterable of objects with `title` and `counts` fields.
     */
    _updateDictionaryOptions(dictionaries) {
        const fragment = document.createDocumentFragment();
        const appendOption = (value, label) => {
            const node = document.createElement('option');
            node.value = value;
            node.textContent = label;
            fragment.appendChild(node);
        };

        appendOption('', 'None');
        for (const {title, counts} of dictionaries) {
            if (!this._dictionaryHasNoFrequencies(counts)) {
                appendOption(title, title);
            }
        }

        const select = this._sortFrequencyDictionarySelect;
        select.textContent = '';
        select.appendChild(fragment);
    }

    /**
     * Stores the selected dictionary in the profile. When a dictionary is
     * selected (non-null), the sort order is auto-detected afterwards.
     * @param value A dictionary name, or null for none.
     */
    async _setSortFrequencyDictionaryValue(value) {
        const noDictionary = (value === null);
        this._sortFrequencyDictionaryOrderContainerNode.hidden = noDictionary;
        await this._settingsController.setProfileSetting('general.sortFrequencyDictionary', value);
        if (noDictionary) { return; }
        await this._autoUpdateOrder(value);
    }

    /**
     * Stores the sort order in the profile.
     * @param value The order value to store.
     */
    async _setSortFrequencyDictionaryOrderValue(value) {
        await this._settingsController.setProfileSetting('general.sortFrequencyDictionaryOrder', value);
    }

    /**
     * Detects the sort order for a dictionary and applies it to both the order
     * selector and the profile. Nothing changes when detection yields 0.
     * @param dictionary The dictionary name to inspect.
     */
    async _autoUpdateOrder(dictionary) {
        const detected = await this._getFrequencyOrder(dictionary);
        if (detected === 0) { return; }
        const orderValue = (detected > 0 ? 'descending' : 'ascending');
        this._sortFrequencyDictionaryOrderSelect.value = orderValue;
        await this._setSortFrequencyDictionaryOrderValue(orderValue);
    }

    /**
     * Compares the frequency values that a dictionary reports for a list of
     * more common terms against those for a list of less common terms.
     * @param dictionary The dictionary name to query.
     * @returns The sign of the accumulated comparison: positive when the more
     *   common terms tend to have larger values, negative when they tend to
     *   have smaller values, and 0 when there is no usable data or no trend.
     */
    async _getFrequencyOrder(dictionary) {
        const moreCommonTerms = ['来る', '言う', '出る', '入る', '方', '男', '女', '今', '何', '時'];
        const lessCommonTerms = ['行なう', '論じる', '過す', '行方', '人口', '猫', '犬', '滝', '理', '暁'];
        const terms = [...moreCommonTerms, ...lessCommonTerms];

        const frequencies = await yomichan.api.getTermFrequencies(
            terms.map((term) => ({term, reading: null})),
            [dictionary]
        );

        // One accumulator per sample term, reachable both by term and by group.
        const detailsByTerm = new Map();
        const createDetailsList = (termList) => termList.map((term) => {
            const details = {hasValue: false, minValue: Number.MAX_SAFE_INTEGER, maxValue: Number.MIN_SAFE_INTEGER};
            detailsByTerm.set(term, details);
            return details;
        });
        const moreCommonTermDetails = createDetailsList(moreCommonTerms);
        const lessCommonTermDetails = createDetailsList(lessCommonTerms);

        for (const {term, frequency} of frequencies) {
            if (typeof frequency !== 'number') { continue; }
            const details = detailsByTerm.get(term);
            if (typeof details === 'undefined') { continue; }
            details.minValue = Math.min(details.minValue, frequency);
            details.maxValue = Math.max(details.maxValue, frequency);
            details.hasValue = true;
        }

        // Every (more common, less common) pair that has data on both sides
        // contributes between -2 and +2 to the score.
        const observedMoreCommon = moreCommonTermDetails.filter(({hasValue}) => hasValue);
        const observedLessCommon = lessCommonTermDetails.filter(({hasValue}) => hasValue);
        let score = 0;
        for (const common of observedMoreCommon) {
            for (const uncommon of observedLessCommon) {
                score += Math.sign(common.maxValue - uncommon.minValue) + Math.sign(common.minValue - uncommon.maxValue);
            }
        }
        return Math.sign(score);
    }

    /**
     * Checks whether dictionary counts explicitly report no frequency entries.
     * @param counts The `counts` field of a dictionary info object.
     * @returns true only when `counts.termMeta.freq` compares `<= 0`; false
     *   when the count information is missing or not an object.
     */
    _dictionaryHasNoFrequencies(counts) {
        const isObject = (value) => (typeof value === 'object' && value !== null);
        if (!isObject(counts)) { return false; }
        const {termMeta} = counts;
        if (!isObject(termMeta)) { return false; }
        return termMeta.freq <= 0;
    }
}

View File

@ -285,6 +285,73 @@
</div></div>
</div>
</div>
<div class="settings-item advanced-only">
<div class="settings-item-inner settings-item-inner-wrappable">
<div class="settings-item-left">
<div class="settings-item-label">Frequency sorting dictionary</div>
<div class="settings-item-description">
Sort results using a frequency dictionary.
<a tabindex="0" class="more-toggle more-only" data-parent-distance="4">More&hellip;</a>
</div>
</div>
<div class="settings-item-right">
<select id="sort-frequency-dictionary"></select>
</div>
</div>
<div class="settings-item-children more" hidden>
<p>
Enabling this option will sort search results using a specific dictionary.
This can be beneficial when using multiple dictionaries which may not have
consistent sorting information.
</p>
<p>
<a tabindex="0" class="more-toggle" data-parent-distance="3">Less&hellip;</a>
</p>
</div>
<div class="settings-item-children settings-item-children-group" id="sort-frequency-dictionary-order-container" hidden>
<div class="settings-item">
<div class="settings-item-inner settings-item-inner-wrappable">
<div class="settings-item-left">
<div class="settings-item-label">
Frequency sorting mode
<a tabindex="0" class="more-toggle more-only" data-parent-distance="4">(?)</a>
</div>
</div>
<div class="settings-item-right">
<div class="horizontal-flex horizontal-flex-nowrap">
<button class="low-emphasis" id="sort-frequency-dictionary-order-auto">Auto</button>
<select id="sort-frequency-dictionary-order">
<option value="descending">Occurrence-based</option>
<option value="ascending">Rank-based</option>
</select>
</div>
</div>
</div>
<div class="settings-item-children more" hidden>
<p>
Dictionary frequency data can be represented in one of two ways:
</p>
<ul>
<li>
<em>Occurrence-based</em>, where the frequency corresponds to a number of occurrences.
Large values indicate a more common term.
</li>
<li>
<em>Rank-based</em>, where the frequency value corresponds to a ranking index.
Smaller values indicate a more common term.
</li>
</ul>
<p>
The correct mode can be determined based on the contents of the dictionary;
the <em>Auto</em> button attempts to auto-detect the correct value.
</p>
<p>
<a tabindex="0" class="more-toggle" data-parent-distance="3">Less&hellip;</a>
</p>
</div>
</div>
</div>
</div>
<div class="settings-item advanced-only"><div class="settings-item-inner settings-item-inner-wrappable">
<div class="settings-item-left">
<div class="settings-item-label">Maximum number of results</div>
@ -3516,6 +3583,7 @@
<script src="/js/pages/settings/sentence-termination-characters-controller.js"></script>
<script src="/js/pages/settings/settings-controller.js"></script>
<script src="/js/pages/settings/settings-display-controller.js"></script>
<script src="/js/pages/settings/sort-frequency-dictionary-controller.js"></script>
<script src="/js/pages/settings/status-footer.js"></script>
<script src="/js/pages/settings/storage-controller.js"></script>
<script src="/js/pages/settings/translation-text-replacements-controller.js"></script>

View File

@ -14,6 +14,8 @@
"default": {
"wildcard": null,
"mainDictionary": "${title}",
"sortFrequencyDictionary": null,
"sortFrequencyDictionaryOrder": "descending",
"removeNonJapaneseCharacters": true,
"convertHalfWidthCharacters": false,
"convertNumericCharacters": false,

File diff suppressed because it is too large Load Diff

View File

@ -302,7 +302,9 @@ function createProfileOptionsUpdatedTestData1() {
popupActionBarVisibility: 'auto',
popupActionBarLocation: 'top',
frequencyDisplayMode: 'split-tags-grouped',
termDisplayMode: 'ruby'
termDisplayMode: 'ruby',
sortFrequencyDictionary: null,
sortFrequencyDictionaryOrder: 'descending'
},
audio: {
enabled: true,
@ -593,7 +595,7 @@ function createOptionsUpdatedTestData1() {
}
],
profileCurrent: 0,
version: 14,
version: 15,
global: {
database: {
prefixWildcardsSupported: false