Fixing dictionary generation
This commit is contained in:
parent 18720aa150
commit 52fca7c5a5
@ -89,6 +89,7 @@ PARSED_TAGS = {
|
|||||||
'obsc': 'obscure term',
|
'obsc': 'obscure term',
|
||||||
'ok': 'out-dated or obsolete kana usage',
|
'ok': 'out-dated or obsolete kana usage',
|
||||||
'on-mim': 'onomatopoeic or mimetic word',
|
'on-mim': 'onomatopoeic or mimetic word',
|
||||||
|
'P': 'popular term',
|
||||||
'p': 'place-name',
|
'p': 'place-name',
|
||||||
'physics': 'physics terminology',
|
'physics': 'physics terminology',
|
||||||
'pn': 'pronoun',
|
'pn': 'pronoun',
|
||||||
@ -177,15 +178,18 @@ def parse_edict(path):
|
|||||||
reading = None if reading_match is None else reading_match.group(1)
|
reading = None if reading_match is None else reading_match.group(1)
|
||||||
|
|
||||||
defs = []
|
defs = []
|
||||||
tags = []
|
tags = set()
|
||||||
|
|
||||||
for index, dfn in enumerate(filter(None, segments[1:])):
|
for index, dfn in enumerate(filter(None, segments[1:])):
|
||||||
dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn)
|
dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$', dfn)
|
||||||
gloss = dfn_match.group(2)
|
|
||||||
|
gloss = dfn_match.group(2).strip()
|
||||||
|
if len(gloss) == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
if index == 0:
|
|
||||||
tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
|
tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
|
||||||
tags = tags_raw.intersection(set(PARSED_TAGS.keys()))
|
tags_raw = tags_raw.intersection(set(PARSED_TAGS.keys()))
|
||||||
|
tags = tags.union(tags_raw)
|
||||||
|
|
||||||
if index == 0 or len(dfn_match.group(1)) > 0:
|
if index == 0 or len(dfn_match.group(1)) > 0:
|
||||||
defs.append([gloss])
|
defs.append([gloss])
|
||||||
|
Loading…
Reference in New Issue
Block a user