Merge pull request #346 from toasted-nutbread/dictionary-schemas

Dictionary schemas
This commit is contained in:
toasted-nutbread 2020-02-11 21:21:37 -05:00 committed by GitHub
commit 9c5ad3ea67
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 373 additions and 0 deletions

View File

@ -0,0 +1,40 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"description": "Index file containing information about the data contained in the dictionary.",
"required": [
"title",
"revision"
],
"properties": {
"title": {
"type": "string",
"description": "Title of the dictionary."
},
"revision": {
"type": "string",
"description": "Revision of the dictionary. This value is only used for displaying information."
},
"sequenced": {
"type": "boolean",
"default": false,
"description": "Whether or not this dictionary can be used as the primary dictionary. Primary dictionaries typically contain term/expression definitions."
},
"format": {
"type": "integer",
"description": "Format of data found in the JSON data files."
},
"version": {
"type": "integer",
"description": "Alias for format."
}
},
"anyOf": [
{
"required": ["format"]
},
{
"required": ["version"]
}
]
}

View File

@ -0,0 +1,33 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Data file containing kanji information.",
"additionalItems": {
"type": "array",
"description": "Information about a single kanji character.",
"minItems": 4,
"items": [
{
"type": "string",
"description": "Kanji character.",
"minLength": 1
},
{
"type": "string",
"description": "String of space-separated onyomi readings for the kanji character. An empty string is treated as no readings."
},
{
"type": "string",
"description": "String of space-separated kunyomi readings for the kanji character. An empty string is treated as no readings."
},
{
"type": "string",
"description": "String of space-separated tags for the kanji character. An empty string is treated as no tags."
}
],
"additionalItems": {
"type": "string",
"description": "A meaning for the kanji character."
}
}
}

View File

@ -0,0 +1,44 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Data file containing kanji information.",
"additionalItems": {
"type": "array",
"description": "Information about a single kanji character.",
"minItems": 6,
"items": [
{
"type": "string",
"description": "Kanji character.",
"minLength": 1
},
{
"type": "string",
"description": "String of space-separated onyomi readings for the kanji character. An empty string is treated as no readings."
},
{
"type": "string",
"description": "String of space-separated kunyomi readings for the kanji character. An empty string is treated as no readings."
},
{
"type": "string",
"description": "String of space-separated tags for the kanji character. An empty string is treated as no tags."
},
{
"type": "array",
"description": "Array of meanings for the kanji character.",
"items": {
"type": "string",
"description": "A meaning for the kanji character."
}
},
{
"type": "object",
"description": "Various stats for the kanji character.",
"additionalProperties": {
"type": "string"
}
}
]
}
}

View File

@ -0,0 +1,25 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Custom metadata for kanji characters.",
"additionalItems": {
"type": "array",
"description": "Metadata about a single kanji character.",
"minItems": 3,
"items": [
{
"type": "string",
"minLength": 1
},
{
"type": "string",
"enum": ["freq"],
"description": "Type of data. \"freq\" corresponds to frequency information."
},
{
"type": ["string", "number"],
"description": "Data for the character."
}
]
}
}

View File

@ -0,0 +1,32 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Data file containing tag information for terms and kanji.",
"additionalItems": {
"type": "array",
"description": "Information about a single tag.",
"minItems": 5,
"items": [
{
"type": "string",
"description": "Tag name."
},
{
"type": "string",
"description": "Category for the tag."
},
{
"type": "number",
"description": "Sorting order for the tag."
},
{
"type": "string",
"description": "Notes for the tag."
},
{
"type": "number",
"description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results."
}
]
}
}

View File

@ -0,0 +1,36 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Data file containing term and expression information.",
"additionalItems": {
"type": "array",
"description": "Information about a single term/expression.",
"minItems": 5,
"items": [
{
"type": "string",
"description": "Term or expression."
},
{
"type": "string",
"description": "Reading of the term/expression, or an empty string if the reading is the same as the term/expression."
},
{
"type": ["string", "null"],
"description": "String of space-separated tags for the definition. An empty string is treated as no tags."
},
{
"type": "string",
"description": "String of space-separated rule identifiers for the definition which is used to validate deinflection. Valid rule identifiers are: v1: ichidan verb; v5: godan verb; vs: suru verb; vk: kuru verb; adj-i: i-adjective. An empty string corresponds to words which aren't inflected, such as nouns."
},
{
"type": "number",
"description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results."
}
],
"additionalItems": {
"type": "string",
"description": "Single definition for the term/expression."
}
}
}

View File

@ -0,0 +1,48 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Data file containing term and expression information.",
"additionalItems": {
"type": "array",
"description": "Information about a single term/expression.",
"minItems": 8,
"items": [
{
"type": "string",
"description": "Term or expression."
},
{
"type": "string",
"description": "Reading of the term/expression, or an empty string if the reading is the same as the term/expression."
},
{
"type": ["string", "null"],
"description": "String of space-separated tags for the definition. An empty string is treated as no tags."
},
{
"type": "string",
"description": "String of space-separated rule identifiers for the definition which is used to validate deinflection. Valid rule identifiers are: v1: ichidan verb; v5: godan verb; vs: suru verb; vk: kuru verb; adj-i: i-adjective. An empty string corresponds to words which aren't inflected, such as nouns."
},
{
"type": "number",
"description": "Score used to determine popularity. Negative values are more rare and positive values are more frequent. This score is also used to sort search results."
},
{
"type": "array",
"description": "Array of definitions for the term/expression.",
"items": {
"type": "string",
"description": "Single definition for the term/expression."
}
},
{
"type": "integer",
"description": "Sequence number for the term/expression. Terms/expressions with the same sequence number can be shown together when the \"resultOutputMode\" option is set to \"merge\"."
},
{
"type": "string",
"description": "String of space-separated tags for the term/expression. An empty string is treated as no tags."
}
]
}
}

View File

@ -0,0 +1,25 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "array",
"description": "Custom metadata for terms/expressions.",
"additionalItems": {
"type": "array",
"description": "Metadata about a single term/expression.",
"minItems": 3,
"items": [
{
"type": "string",
"description": "Term or expression."
},
{
"type": "string",
"enum": ["freq"],
"description": "Type of data. \"freq\" corresponds to frequency information."
},
{
"type": ["string", "number"],
"description": "Data for the term/expression."
}
]
}
}

View File

@ -0,0 +1,90 @@
const fs = require('fs');
const path = require('path');
// Deprecation warnings are switched off only for the duration of the JSZip require() call,
// then process.noDeprecation is set back to false.
process.noDeprecation = true; // Suppress a warning about JSZip
const JSZip = require(path.join(__dirname, '../ext/mixed/lib/jszip.min.js'));
process.noDeprecation = false;
// The schema validator is read from disk as plain text instead of being require()'d.
// NOTE(review): presumably json-schema.js is a browser script without module exports,
// which would explain the manual evaluation below - confirm.
const jsonSchemaFileName = path.join(__dirname, '../ext/bg/js/json-schema.js');
const jsonSchemaFileSource = fs.readFileSync(jsonSchemaFileName, {encoding: 'utf8'});
// Evaluate the source inside a strict-mode function body and hand back the JsonSchema
// binding that the evaluated source is expected to declare.
const JsonSchema = Function(`'use strict';${jsonSchemaFileSource};return JsonSchema;`)();
/**
 * Loads a JSON document stored relative to this script's directory.
 *
 * @param {string} relativeFileName Path to the JSON file, resolved against __dirname.
 * @returns {*} The parsed JSON value.
 * @throws If the file cannot be read or does not contain valid JSON.
 */
function readSchema(relativeFileName) {
    const schemaText = fs.readFileSync(path.join(__dirname, relativeFileName), {encoding: 'utf8'});
    return JSON.parse(schemaText);
}
/**
 * Validates consecutively numbered bank files of one kind against a schema.
 *
 * Bank numbers start at 1; the first number with no matching entry in the
 * archive ends the scan, so files after a gap in the numbering are not visited.
 *
 * @param {object} zip Archive object exposing a `files` map keyed by file name.
 * @param {string} fileNameFormat File name pattern whose first "%s" is replaced by the bank number.
 * @param {object} schema Schema passed to JsonSchema.validate for every bank found.
 * @returns {Promise<void>}
 */
async function validateDictionaryBanks(zip, fileNameFormat, schema) {
    for (let bankNumber = 1; ; ++bankNumber) {
        const bankFile = zip.files[fileNameFormat.replace(/%s/, bankNumber)];
        if (!bankFile) { return; }
        const bankText = await bankFile.async('string');
        JsonSchema.validate(JSON.parse(bankText), schema);
    }
}
/**
 * Validates one dictionary archive: its index.json and every numbered data bank.
 *
 * The index is validated before its format/version field is read, so an
 * index.json that is not a JSON object (for example `null`) is handed to the
 * schema validator instead of failing on the property access.
 *
 * @param {string} fileName Path of the dictionary zip archive.
 * @param {object} schemas Schemas keyed by index, termBankV1, termBankV3, termMetaBankV3,
 *     kanjiBankV1, kanjiBankV3, kanjiMetaBankV3 and tagBankV3.
 * @returns {Promise<void>}
 * @throws {Error} If the archive has no index.json; read, parse and validation errors propagate.
 */
async function validateDictionary(fileName, schemas) {
    const source = fs.readFileSync(fileName);
    const zip = await JSZip.loadAsync(source);

    const indexFile = zip.files['index.json'];
    if (!indexFile) {
        throw new Error('No dictionary index found in archive');
    }

    const index = JSON.parse(await indexFile.async('string'));
    // NOTE(review): presumably JsonSchema.validate throws on a mismatch - its return value is not inspected here.
    JsonSchema.validate(index, schemas.index);

    // "version" is an alias for "format"; only version 1 selects the V1 bank schemas.
    const version = index.format || index.version;
    const isV1 = (version === 1);

    await validateDictionaryBanks(zip, 'term_bank_%s.json', isV1 ? schemas.termBankV1 : schemas.termBankV3);
    await validateDictionaryBanks(zip, 'term_meta_bank_%s.json', schemas.termMetaBankV3);
    await validateDictionaryBanks(zip, 'kanji_bank_%s.json', isV1 ? schemas.kanjiBankV1 : schemas.kanjiBankV3);
    await validateDictionaryBanks(zip, 'kanji_meta_bank_%s.json', schemas.kanjiMetaBankV3);
    await validateDictionaryBanks(zip, 'tag_bank_%s.json', schemas.tagBankV3);
}
/**
 * Command-line entry point: validates every dictionary archive named in the arguments.
 *
 * With no arguments a usage message is printed and nothing is validated.
 * Each dictionary is validated independently; a failure is logged and the
 * remaining dictionaries are still processed. Any failure sets
 * process.exitCode to 1 so that scripts invoking this tool can detect it.
 *
 * @returns {Promise<void>}
 */
async function main() {
    const dictionaryFileNames = process.argv.slice(2);
    if (dictionaryFileNames.length === 0) {
        console.log([
            'Usage:',
            ' node dictionary-validate <dictionary-file-names>...'
        ].join('\n'));
        return;
    }

    const schemas = {
        index: readSchema('../ext/bg/data/dictionary-index-schema.json'),
        kanjiBankV1: readSchema('../ext/bg/data/dictionary-kanji-bank-v1-schema.json'),
        kanjiBankV3: readSchema('../ext/bg/data/dictionary-kanji-bank-v3-schema.json'),
        kanjiMetaBankV3: readSchema('../ext/bg/data/dictionary-kanji-meta-bank-v3-schema.json'),
        tagBankV3: readSchema('../ext/bg/data/dictionary-tag-bank-v3-schema.json'),
        termBankV1: readSchema('../ext/bg/data/dictionary-term-bank-v1-schema.json'),
        termBankV3: readSchema('../ext/bg/data/dictionary-term-bank-v3-schema.json'),
        termMetaBankV3: readSchema('../ext/bg/data/dictionary-term-meta-bank-v3-schema.json')
    };

    for (const dictionaryFileName of dictionaryFileNames) {
        try {
            console.log(`Validating ${dictionaryFileName}...`);
            await validateDictionary(dictionaryFileName, schemas);
            console.log('No issues found');
        } catch (e) {
            console.warn(e);
            // Keep going with the other dictionaries, but make the failure visible in the exit status.
            process.exitCode = 1;
        }
    }
}

// Errors raised outside the per-dictionary loop (e.g. while loading schemas) are reported
// here instead of being left as an unhandled promise rejection.
main().catch((e) => {
    console.warn(e);
    process.exitCode = 1;
});