From 4b17e79cb82c7c4348ada090f95ea484effe36c2 Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Sun, 26 Jan 2020 15:06:42 -0500 Subject: [PATCH 1/2] Add schemas for dictionary data --- ext/bg/data/dictionary-index-schema.json | 40 ++++++++++++++++ .../data/dictionary-kanji-bank-v1-schema.json | 33 +++++++++++++ .../data/dictionary-kanji-bank-v3-schema.json | 44 +++++++++++++++++ .../dictionary-kanji-meta-bank-v3-schema.json | 25 ++++++++++ .../data/dictionary-tag-bank-v3-schema.json | 32 +++++++++++++ .../data/dictionary-term-bank-v1-schema.json | 36 ++++++++++++++ .../data/dictionary-term-bank-v3-schema.json | 48 +++++++++++++++++++ .../dictionary-term-meta-bank-v3-schema.json | 25 ++++++++++ 8 files changed, 283 insertions(+) create mode 100644 ext/bg/data/dictionary-index-schema.json create mode 100644 ext/bg/data/dictionary-kanji-bank-v1-schema.json create mode 100644 ext/bg/data/dictionary-kanji-bank-v3-schema.json create mode 100644 ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json create mode 100644 ext/bg/data/dictionary-tag-bank-v3-schema.json create mode 100644 ext/bg/data/dictionary-term-bank-v1-schema.json create mode 100644 ext/bg/data/dictionary-term-bank-v3-schema.json create mode 100644 ext/bg/data/dictionary-term-meta-bank-v3-schema.json diff --git a/ext/bg/data/dictionary-index-schema.json b/ext/bg/data/dictionary-index-schema.json new file mode 100644 index 00000000..9865fcc1 --- /dev/null +++ b/ext/bg/data/dictionary-index-schema.json @@ -0,0 +1,40 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "description": "Index file containing information about the data contained in the dictionary.", + "required": [ + "title", + "revision" + ], + "properties": { + "title": { + "type": "string", + "description": "Title of the dictionary." + }, + "revision": { + "type": "string", + "description": "Revision of the dictionary. This value is only used for displaying information." 
+ }, + "sequenced": { + "type": "boolean", + "default": false, + "description": "Whether or not this dictionary can be used as the primary dictionary. Primary dictionaries typically contain term/expression definitions." + }, + "format": { + "type": "integer", + "description": "Format of data found in the JSON data files." + }, + "version": { + "type": "integer", + "description": "Alias for format." + } + }, + "anyOf": [ + { + "required": ["format"] + }, + { + "required": ["version"] + } + ] +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-kanji-bank-v1-schema.json b/ext/bg/data/dictionary-kanji-bank-v1-schema.json new file mode 100644 index 00000000..6dad5a7a --- /dev/null +++ b/ext/bg/data/dictionary-kanji-bank-v1-schema.json @@ -0,0 +1,33 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Data file containing kanji information.", + "additionalItems": { + "type": "array", + "description": "Information about a single kanji character.", + "minItems": 4, + "items": [ + { + "type": "string", + "description": "Kanji character.", + "minLength": 1 + }, + { + "type": "string", + "description": "String of space-separated onyomi readings for the kanji character. An empty string is treated as no readings." + }, + { + "type": "string", + "description": "String of space-separated kunyomi readings for the kanji character. An empty string is treated as no readings." + }, + { + "type": "string", + "description": "String of space-separated tags for the kanji character. An empty string is treated as no tags." + } + ], + "additionalItems": { + "type": "string", + "description": "A meaning for the kanji character." 
+ } + } +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-kanji-bank-v3-schema.json b/ext/bg/data/dictionary-kanji-bank-v3-schema.json new file mode 100644 index 00000000..a5b82039 --- /dev/null +++ b/ext/bg/data/dictionary-kanji-bank-v3-schema.json @@ -0,0 +1,44 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Data file containing kanji information.", + "additionalItems": { + "type": "array", + "description": "Information about a single kanji character.", + "minItems": 6, + "items": [ + { + "type": "string", + "description": "Kanji character.", + "minLength": 1 + }, + { + "type": "string", + "description": "String of space-separated onyomi readings for the kanji character. An empty string is treated as no readings." + }, + { + "type": "string", + "description": "String of space-separated kunyomi readings for the kanji character. An empty string is treated as no readings." + }, + { + "type": "string", + "description": "String of space-separated tags for the kanji character. An empty string is treated as no tags." + }, + { + "type": "array", + "description": "Array of meanings for the kanji character.", + "items": { + "type": "string", + "description": "A meaning for the kanji character." 
+ } + }, + { + "type": "object", + "description": "Various stats for the kanji character.", + "additionalProperties": { + "type": "string" + } + } + ] + } +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json b/ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json new file mode 100644 index 00000000..62479026 --- /dev/null +++ b/ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json @@ -0,0 +1,25 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Custom metadata for kanji characters.", + "additionalItems": { + "type": "array", + "description": "Metadata about a single kanji character.", + "minItems": 3, + "items": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "string", + "enum": ["freq"], + "description": "Type of data. \"freq\" corresponds to frequency information." + }, + { + "type": ["string", "number"], + "description": "Data for the character." + } + ] + } +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-tag-bank-v3-schema.json b/ext/bg/data/dictionary-tag-bank-v3-schema.json new file mode 100644 index 00000000..ee5ca64d --- /dev/null +++ b/ext/bg/data/dictionary-tag-bank-v3-schema.json @@ -0,0 +1,32 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Data file containing tag information for terms and kanji.", + "additionalItems": { + "type": "array", + "description": "Information about a single tag.", + "minItems": 5, + "items": [ + { + "type": "string", + "description": "Tag name." + }, + { + "type": "string", + "description": "Category for the tag." + }, + { + "type": "number", + "description": "Sorting order for the tag." + }, + { + "type": "string", + "description": "Notes for the tag." + }, + { + "type": "number", + "description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results." 
+ } + ] + } +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-term-bank-v1-schema.json b/ext/bg/data/dictionary-term-bank-v1-schema.json new file mode 100644 index 00000000..6ffb26e6 --- /dev/null +++ b/ext/bg/data/dictionary-term-bank-v1-schema.json @@ -0,0 +1,36 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Data file containing term and expression information.", + "additionalItems": { + "type": "array", + "description": "Information about a single term/expression.", + "minItems": 5, + "items": [ + { + "type": "string", + "description": "Term or expression." + }, + { + "type": "string", + "description": "Reading of the term/expression, or an empty string if the reading is the same as the term/expression." + }, + { + "type": ["string", "null"], + "description": "String of space-separated tags for the definition. An empty string is treated as no tags." + }, + { + "type": "string", + "description": "String of space-separated rule identifiers for the definition which is used to validate deinflection. Valid rule identifiers are: v1: ichidan verb; v5: godan verb; vs: suru verb; vk: kuru verb; adj-i: i-adjective. An empty string corresponds to words which aren't inflected, such as nouns." + }, + { + "type": "number", + "description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results." + } + ], + "additionalItems": { + "type": "string", + "description": "Single definition for the term/expression." 
+ } + } +} \ No newline at end of file diff --git a/ext/bg/data/dictionary-term-bank-v3-schema.json b/ext/bg/data/dictionary-term-bank-v3-schema.json new file mode 100644 index 00000000..bb982e36 --- /dev/null +++ b/ext/bg/data/dictionary-term-bank-v3-schema.json @@ -0,0 +1,48 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "array", + "description": "Data file containing term and expression information.", + "additionalItems": { + "type": "array", + "description": "Information about a single term/expression.", + "minItems": 8, + "items": [ + { + "type": "string", + "description": "Term or expression." + }, + { + "type": "string", + "description": "Reading of the term/expression, or an empty string if the reading is the same as the term/expression." + }, + { + "type": ["string", "null"], + "description": "String of space-separated tags for the definition. An empty string is treated as no tags." + }, + { + "type": "string", + "description": "String of space-separated rule identifiers for the definition which is used to validate deinflection. Valid rule identifiers are: v1: ichidan verb; v5: godan verb; vs: suru verb; vk: kuru verb; adj-i: i-adjective. An empty string corresponds to words which aren't inflected, such as nouns." + }, + { + "type": "number", + "description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results." + }, + { + "type": "array", + "description": "Array of definitions for the term/expression.", + "items": { + "type": "string", + "description": "Single definition for the term/expression." + } + }, + { + "type": "integer", + "description": "Sequence number for the term/expression. Terms/expressions with the same sequence number can be shown together when the \"resultOutputMode\" option is set to \"merge\"." + }, + { + "type": "string", + "description": "String of space-separated tags for the term/expression. 
An empty string is treated as no tags."
+            }
+        ]
+    }
+}
\ No newline at end of file
diff --git a/ext/bg/data/dictionary-term-meta-bank-v3-schema.json b/ext/bg/data/dictionary-term-meta-bank-v3-schema.json
new file mode 100644
index 00000000..1cc0557f
--- /dev/null
+++ b/ext/bg/data/dictionary-term-meta-bank-v3-schema.json
@@ -0,0 +1,25 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "type": "array",
+    "description": "Custom metadata for terms/expressions.",
+    "additionalItems": {
+        "type": "array",
+        "description": "Metadata about a single term/expression.",
+        "minItems": 3,
+        "items": [
+            {
+                "type": "string",
+                "description": "Term or expression."
+            },
+            {
+                "type": "string",
+                "enum": ["freq"],
+                "description": "Type of data. \"freq\" corresponds to frequency information."
+            },
+            {
+                "type": ["string", "number"],
+                "description": "Data for the term/expression."
+            }
+        ]
+    }
+}
\ No newline at end of file
From 8733e324ecbe10bcb4bc9f1a0b9568c7f32429d3 Mon Sep 17 00:00:00 2001
From: toasted-nutbread
Date: Sat, 1 Feb 2020 22:41:02 -0500
Subject: [PATCH 2/2] Create script to validate dictionary files

---
 test/dictionary-validate.js | 126 +++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 test/dictionary-validate.js

diff --git a/test/dictionary-validate.js b/test/dictionary-validate.js
new file mode 100644
index 00000000..971c4971
--- /dev/null
+++ b/test/dictionary-validate.js
@@ -0,0 +1,126 @@
+const fs = require('fs');
+const path = require('path');
+
+process.noDeprecation = true; // Suppress a warning about JSZip
+const JSZip = require(path.join(__dirname, '../ext/mixed/lib/jszip.min.js'));
+process.noDeprecation = false;
+
+// json-schema.js is not loaded with require(): its source text is evaluated inside
+// a function wrapper which returns the JsonSchema binding that the source defines.
+const jsonSchemaFileName = path.join(__dirname, '../ext/bg/js/json-schema.js');
+const jsonSchemaFileSource = fs.readFileSync(jsonSchemaFileName, {encoding: 'utf8'});
+const JsonSchema = Function(`'use strict';${jsonSchemaFileSource};return JsonSchema;`)();
+
+
+/**
+ * Reads and parses a JSON schema file.
+ * @param {string} relativeFileName Path of the schema file, relative to the directory containing this script.
+ * @returns {*} The parsed JSON content of the file.
+ * @throws {Error} If the file cannot be read or does not contain valid JSON.
+ */
+function readSchema(relativeFileName) {
+    const fileName = path.join(__dirname, relativeFileName);
+    const source = fs.readFileSync(fileName, {encoding: 'utf8'});
+    return JSON.parse(source);
+}
+
+
+/**
+ * Validates every consecutively numbered bank file of one kind in a dictionary archive.
+ * Numbering starts at 1 and validation stops at the first number which has no file.
+ * @param {JSZip} zip Loaded dictionary archive.
+ * @param {string} fileNameFormat Bank file name in which the first "%s" is replaced by the bank number.
+ * @param {object} schema Parsed JSON schema to validate each bank against.
+ * @throws {Error} If a bank is not valid JSON. JsonSchema.validate is also expected to throw
+ *   when a bank violates the schema; its return value is not inspected.
+ */
+async function validateDictionaryBanks(zip, fileNameFormat, schema) {
+    let index = 1;
+    while (true) {
+        const fileName = fileNameFormat.replace(/%s/, index);
+
+        const file = zip.files[fileName];
+        if (!file) { break; }
+
+        const data = JSON.parse(await file.async('string'));
+        JsonSchema.validate(data, schema);
+
+        ++index;
+    }
+}
+
+/**
+ * Validates the index and all bank files of a single dictionary archive.
+ * @param {string} fileName Path of the dictionary zip archive.
+ * @param {object} schemas Parsed schemas keyed by index, termBankV1, termBankV3, termMetaBankV3,
+ *   kanjiBankV1, kanjiBankV3, kanjiMetaBankV3 and tagBankV3.
+ * @throws {Error} If the archive has no index.json, or if any file fails to parse or validate.
+ */
+async function validateDictionary(fileName, schemas) {
+    const source = fs.readFileSync(fileName);
+    const zip = await JSZip.loadAsync(source);
+
+    const indexFile = zip.files['index.json'];
+    if (!indexFile) {
+        throw new Error('No dictionary index found in archive');
+    }
+
+    const index = JSON.parse(await indexFile.async('string'));
+    // "version" is accepted as an alias for "format".
+    const version = index.format || index.version;
+
+    JsonSchema.validate(index, schemas.index);
+
+    // Term and kanji banks use the V1 schemas only for format 1; every other format uses V3.
+    await validateDictionaryBanks(zip, 'term_bank_%s.json', version === 1 ? schemas.termBankV1 : schemas.termBankV3);
+    await validateDictionaryBanks(zip, 'term_meta_bank_%s.json', schemas.termMetaBankV3);
+    await validateDictionaryBanks(zip, 'kanji_bank_%s.json', version === 1 ? schemas.kanjiBankV1 : schemas.kanjiBankV3);
+    await validateDictionaryBanks(zip, 'kanji_meta_bank_%s.json', schemas.kanjiMetaBankV3);
+    await validateDictionaryBanks(zip, 'tag_bank_%s.json', schemas.tagBankV3);
+}
+
+
+/**
+ * Entry point. Validates every dictionary archive named on the command line and sets a
+ * non-zero exit code if any of them fails, so callers such as CI can detect failures.
+ */
+async function main() {
+    const dictionaryFileNames = process.argv.slice(2);
+    if (dictionaryFileNames.length === 0) {
+        console.log([
+            'Usage:',
+            '  node dictionary-validate <dictionary-file-names>...'
+        ].join('\n'));
+        return;
+    }
+
+    const schemas = {
+        index: readSchema('../ext/bg/data/dictionary-index-schema.json'),
+        kanjiBankV1: readSchema('../ext/bg/data/dictionary-kanji-bank-v1-schema.json'),
+        kanjiBankV3: readSchema('../ext/bg/data/dictionary-kanji-bank-v3-schema.json'),
+        kanjiMetaBankV3: readSchema('../ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json'),
+        tagBankV3: readSchema('../ext/bg/data/dictionary-tag-bank-v3-schema.json'),
+        termBankV1: readSchema('../ext/bg/data/dictionary-term-bank-v1-schema.json'),
+        termBankV3: readSchema('../ext/bg/data/dictionary-term-bank-v3-schema.json'),
+        termMetaBankV3: readSchema('../ext/bg/data/dictionary-term-meta-bank-v3-schema.json')
+    };
+
+    for (const dictionaryFileName of dictionaryFileNames) {
+        try {
+            console.log(`Validating ${dictionaryFileName}...`);
+            await validateDictionary(dictionaryFileName, schemas);
+            console.log('No issues found');
+        } catch (e) {
+            console.warn(e);
+            // Keep validating the remaining dictionaries, but report the failure to the caller.
+            process.exitCode = 1;
+        }
+    }
+}
+
+
+// Failures outside the per-dictionary loop (e.g. an unreadable schema) would otherwise be an unhandled rejection.
+main().catch((e) => {
+    console.error(e);
+    process.exitCode = 1;
+});