From efe8b38b66678b23e743e21f24c72dc99849fcb8 Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Tue, 11 Nov 2014 00:52:39 +1000 Subject: [PATCH 1/6] Add WIP gismu validator script. --- validator.py | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100755 validator.py diff --git a/validator.py b/validator.py new file mode 100755 index 0000000..dc479ba --- /dev/null +++ b/validator.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python3 + +# 2-clause BSD license +""" +Copyright (c) 2014, Andrew Browne +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. +""" + +PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu'] + +class RESULT(object): + def max(a, b): + if a.getValue() < b.getValue(): + return b + else: + return a + +class PASS(RESULT): + def getName(): + return 'PASS' + def getValue(): + return 0 +class WARNING(RESULT): + def getName(): + return 'WARNING' + def getValue(): + return 1 +class FAIL(RESULT): + def getName(): + return 'FAIL' + def getValue(): + return 2 + +CONSONANTS = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'z'] +VOWELS = ['a', 'e', 'i', 'o', 'u'] + +def isCorV(ch): + if ch.lower() in CONSONANTS: + return 'c' + if ch.lower() in VOWELS: + return 'v' + if ch.lower() is 'y': + # sort of a consonant + return 'y' + return '#' + +def getFileList(args, realpath=True, skipRegex=None): + import os + initial = [] + if args: + # Start from the arguments + for a in args: + initial.append(a) + else: + # If no arguments are given, start from the current working directory + initial.append(os.getcwd()) + + # Build up the list of files under the initial paths + result = [] + for init in initial: + if os.path.isdir(init): + # Recurse into directories + for root, dirs, files in os.walk(init): + for f in files: + path = os.path.join(root,f) + result.append(path) + else: + result.append(init) + + # Realpath results + if realpath: + result = [os.path.realpath(x) for x in result] + + if skipRegex: + import re + skipRE = re.compile(skipRegex) + result = [x for x in result if not skipRE.match(x) ] + + return sorted(result) + +class YAMLValidator: + def __init__(self): + pass + +class Metrics: + def __init__(self): + self.gismuCount = 0 + self.definitionLang = {} + + def add(self, other): + self.gismuCount += other.gismuCount + for k,v in other.definitionLang.items(): + if k in self.definitionLang.keys(): + self.definitionLang[k] += v + else: + self.definitionLang[k] = v + + def addGismu(self, gismu): + self.gismuCount += 1 + + def addDefLang(self, lang): + if lang in self.definitionLang.keys(): + self.definitionLang[lang] += 1 + else: + self.definitionLang[lang] = 1 + + def print(self): + print("%d gismu" % (self.gismuCount)) + for k,v in self.definitionLang.items(): + print("%d defs in language %s" % (v, k)) + +class GismuValidator(YAMLValidator): + def __init__(self, gismu): + self.gismu = gismu + self.metrics = Metrics() + + def checkGismu(self, ast): + sections = {'word': GismuValidator.checkGismuWord, + 'rafsi': GismuValidator.checkGismuRafsi, + 'examples': GismuValidator.checkGismuExamples, + 'definitions': GismuValidator.checkGismuDefinitions } + result = PASS + for k,v in ast.items(): + sub = sections.get(k) + if sub is not None: + subResult = sub(self, v) + result = RESULT.max(result, subResult) + else: + print("unrecognised section: %s" % (k)) + return result + def checkGismuWord(self, ast): + self.metrics.addGismu(self.gismu) + + # Must match filename + if ast != self.gismu: + "gismu does not match file name" + return FAIL + + # CLL 4.4 - always have five letters + if len(self.gismu) != 5: + "gismu must always have five letters" + return FAIL + + # CLL 4.4 - start with a consonant and end with a single vowel + # CLL 4.4 - always contain exactly one consonant pair + cvString = '' + for l in self.gismu: + cvString += isCorV(l) + if cvString not in ['ccvcv', 'cvccv']: + "gismu form is invalid" + return FAIL + + # TODO: check for conflicting gismu (see CLL 4.14.4) + return PASS + + def checkGismuRafsi(self, ast): + # TODO: check rasfi are well formed + return PASS + + def checkGismuExamples(self, ast): + # TODO: check examples are well formed + return PASS + + def checkGismuDefinitions(self, ast): + for k,v in ast.items(): + self.metrics.addDefLang(k) + # TODO: check definitions are well formed + return PASS + +def gismuFromFilename(gismuFile): + import os + import re + gfile = os.path.basename(gismuFile) + g = re.match('(.....)\.yaml', gfile) + if g: + return g.group(1) + return None + +def validate_gismu(gismuFile): + f = open(gismuFile, 'r') + t = f.read() + import yaml + y = None + try: + y = yaml.load(t) + except yaml.YAMLError as exc: + print(exc) + if hasattr(exc, 'problem_mark'): + mark = exc.problem_mark + print("Error position: (%s:%s)" % (mark.line+1, mark.column+1)) + if y is None: + return FAIL + + g = gismuFromFilename(gismuFile) + gv = GismuValidator(g) + result = gv.checkGismu(y) + return (result, gv.metrics) + +def getGismuDirs(): + import os + root = os.path.dirname(os.path.realpath(__file__)) + return [os.path.join(root, d) for d in PATHS_GISMU_DIRS] + +def main(): + gismu_dirs = getGismuDirs() + count = {PASS: 0, WARNING: 0, FAIL: 0} + total = Metrics() + for x in getFileList(gismu_dirs, skipRegex='.*\.md$'): + (result, metrics) = validate_gismu(x) + print("%s: %s" % (x, result.getName())) + count[result] = count[result] + 1 + total.add(metrics) + + print("Summary:") + for r,c in count.items(): + print("%s %d" % (r.getName(), c)) + total.print() + + exitcode = 0 + if count[FAIL]: + exitcode=1 + import sys + sys.exit(exitcode) + +if __name__ == "__main__": + main() + From dc34e3f2388302f1acc653fc42a771a158ac2785 Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Thu, 20 Nov 2014 01:11:47 +1000 Subject: [PATCH 2/6] Refactor validator. --- pygimste/__init__.py | 0 pygimste/gismu.py | 280 +++++++++++++++++++++++++++++++++++++++++++ validator.py | 214 +++++++++++---------------------- 3 files changed, 348 insertions(+), 146 deletions(-) create mode 100644 pygimste/__init__.py create mode 100755 pygimste/gismu.py diff --git a/pygimste/__init__.py b/pygimste/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/pygimste/gismu.py b/pygimste/gismu.py new file mode 100755 index 0000000..0f4f0b5 --- /dev/null +++ b/pygimste/gismu.py @@ -0,0 +1,280 @@ +# 2-clause BSD license +""" +Copyright (c) 2014, Andrew Browne +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. +""" + +# Morphology utilities +class Morph: + def isCorV(ch): + CONSONANTS = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'z'] + VOWELS = ['a', 'e', 'i', 'o', 'u'] + if ch.lower() in CONSONANTS: + return 'c' + if ch.lower() in VOWELS: + return 'v' + if ch.lower() is 'y': + # sort of a consonant + return 'y' + if ch is '\'': + return '\'' + if ch is ',': + return ',' + if ch is '.': + return '.' + # character is not in lojban word + return '#' + + def getCVstring(str): + cvString = '' + for l in str: + cvString += Morph.isCorV(l) + return cvString + +class YamlParseException(Exception): + def __init__(self, exc, msg, line=0, col=0): + self.exc = exc + self.message = msg + self.line = line + self.col = col + +class GismuValidationException(Exception): + def __init__(self, gismu, message): + self.gismu = gismu + self.message = message + +class GismuExampleTranslation: + def __init__(self, lang, text): + self.lang = lang + self.text = text + + def getLang(self): + return self.lang + + def getText(self): + return self.text + +class GismuExample: + def __init__(self, jboText): + self.jbo = jboText + # Not a map, because one lang could have multile variant translations + self.translations = [] + + def addTranslation(self, lang, text): + t = GismuExampleTranslation(lang, text) + self.translations.append(t) + + def getLojbanText(self): + return self.jbo + + def getTranslations(self): + return self.translations + +class GismuDefinition: + def __init__(self, lang): + self.lang = lang + self.placeStructure = None + self.notes = [] + self.glosses = [] + + def getLang(self): + return self.lang + + def setPlace(self, place): + self.placeStructure = place + + def addGlosses(self, glosses): + self.glosses.extend(glosses) + + def addNotes(self, notes): + self.notes.extend(notes) + +class Gismu: + def __init__(self, gismu): + # The gismu itself + self.gismu = gismu + + # The yaml text representing this gismu + self.textYaml = None + + # A string representing the consonant/vowel pattern in the gismu. Morphology requires 'cvccv' or 'ccvcv'. + self.gismuCV = Morph.getCVstring(self.gismu) + + # A list of rafsi associated with this gismu + self.rafsi = [] + + self.examples = [] + self.definitions = {} + + + # A list of all possible valid rafsi forms. Not all of these are used. + # Only access this via getPossibleRafsi(). + # None => not cached. + self._rafsiForms = None + + def get(self): + return self.gismu + + def setTextYaml(self, text): + self.textYaml = text + + def addRafsi(self, rafsi): + self.rafsi.append(rafsi) + + # Returns a list of rafsi associated with this gismu + def getRafsi(self): + return self.rafsi + + # Returns a list of all valid rafsi forms. + # This must be is a superset of the rafsi associated with this gismu. + # Not all possible valid forms are associated with this gismu. + def getPossibleRafsi(self): + if self._rafsiForms is None: + g = self.gismu + # CLL 4.6 - the 5-letter-rafsi, and the 4-letter-rafsi + forms = [g, g[0:4]] + if self.gismuCV == 'cvccv': + # CLL 4.6 - valid short rafsi forms for cvccv + forms.extend([ + (g[0] + g[1] + g[2]), + (g[0] + g[1] + g[3]), + (g[0] + g[1] + '\'' + g[4]), + (g[0] + g[1] + g[4]), + (g[2] + g[3] + g[4]), + (g[0] + g[2] + g[1])]) + elif self.gismuCV == 'ccvcv': + # CLL 4.6 - valid short rafsi forms for ccvcv + forms.extend([ + (g[0] + g[2] + g[3]), + (g[1] + g[2] + g[3]), + (g[0] + g[2] + '\'' + g[4]), + (g[0] + g[2] + g[4]), + (g[1] + g[2] + '\'' + g[4]), + (g[1] + g[2] + g[4]), + (g[0] + g[1] + g[2])]) + else: + raise GismuValidationException(self.gismu, "gismu form is invalid: %s" % (self.gismuCV)) + self._rafsiForms = forms + return self._rafsiForms + + def addExample(self, example): + self.examples.append(example) + + def addDefinition(self, definition): + lang = definition.getLang() + if lang not in self.definitions.keys(): + self.definitions[lang] = [] + self.definitions[lang].append(definition) + + def getDefinitions(self): + return self.definitions + + def validateForms(self): + # CLL 4.4 - always have five letters + if len(self.gismu) != 5: + raise GismuValidationException(self.gismu, "gismu must always have five letters") + + # CLL 4.4 - start with a consonant and end with a single vowel + # CLL 4.4 - always contain exactly one consonant pair + if self.gismuCV not in ['ccvcv', 'cvccv']: + raise GismuValidationException(self.gismu, "gismu form is invalid: %s" % (self.gismuCV)) + + shortRafsi = self.getPossibleRafsi() + for r in self.rafsi: + if r not in shortRafsi: + raise GismuValidationException(self.gismu, "rasfi form is invalid: %s" % (r)) + + + def validateExamples(self): + # TODO: check examples are well formed + pass + + def validateDefinitions(self): + # TODO: check definitions are well formed + pass + + def validate(self): + self.validateForms() + self.validateExamples() + self.validateDefinitions() + +def yaml2Gismu(gismu, text): + import yaml + y = None + try: + y = yaml.load(text) + except yaml.YAMLError as exc: + line = 0 + col = 0 + if hasattr(exc, 'problem_mark'): + mark = exc.problem_mark + print("Error position: (%s:%s)" % (mark.line+1, mark.column+1)) + line = mark.line+1 + col = mark.column+1 + raise YamlParseException(exc, str(exc), line, col) + if y is None: + return None + + result = Gismu(gismu) + + if y['word'] != gismu: + raise YamlParseException(None, "gismu word does not match") + + + rafsiData = y['rafsi'] + if rafsiData == ['No rafsi.']: + pass + else: + for r in rafsiData: + result.addRafsi(r) + + for ex in y['examples']: + result.addExample(ex) + + for lang,v in y['definitions'].items(): + gd = GismuDefinition(lang) + gd.setPlace(v['place structure']) + gd.addGlosses(v['glosses']) + gd.addNotes(v['notes']) + result.addDefinition(gd) + + result.validate() + result.setTextYaml(text) + return result + +def Gismu2yaml(g): + if g.getTextYaml(): + return g.getTextYaml() + + import yaml + try: + t = yaml.dump(g) + return t + except yaml.YAMLError as exc: + pass + return None + diff --git a/validator.py b/validator.py index dc479ba..e577147 100755 --- a/validator.py +++ b/validator.py @@ -30,43 +30,10 @@ either expressed or implied, of the FreeBSD Project. """ -PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu'] +#PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu'] +PATHS_GISMU_DIRS = ['experimental_gismu'] -class RESULT(object): - def max(a, b): - if a.getValue() < b.getValue(): - return b - else: - return a - -class PASS(RESULT): - def getName(): - return 'PASS' - def getValue(): - return 0 -class WARNING(RESULT): - def getName(): - return 'WARNING' - def getValue(): - return 1 -class FAIL(RESULT): - def getName(): - return 'FAIL' - def getValue(): - return 2 - -CONSONANTS = ['b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'z'] -VOWELS = ['a', 'e', 'i', 'o', 'u'] - -def isCorV(ch): - if ch.lower() in CONSONANTS: - return 'c' - if ch.lower() in VOWELS: - return 'v' - if ch.lower() is 'y': - # sort of a consonant - return 'y' - return '#' +from pygimste import gismu def getFileList(args, realpath=True, skipRegex=None): import os @@ -102,25 +69,41 @@ def getFileList(args, realpath=True, skipRegex=None): return sorted(result) -class YAMLValidator: - def __init__(self): - pass + +def gismuFromFilename(gismuFile): + import os + import re + gfile = os.path.basename(gismuFile) + g = re.match('(.....)\.yaml', gfile) + if g: + return g.group(1) + return None + +GISMU = {} + +def load_gismu_file(gismuFile): + f = open(gismuFile, 'r') + gStr = gismuFromFilename(gismuFile) + t = f.read() + gObj = gismu.yaml2Gismu(gStr, t) + GISMU[gObj.get()] = gObj + +def getGismuDirs(): + import os + root = os.path.dirname(os.path.realpath(__file__)) + return [os.path.join(root, d) for d in PATHS_GISMU_DIRS] class Metrics: def __init__(self): self.gismuCount = 0 self.definitionLang = {} - def add(self, other): - self.gismuCount += other.gismuCount - for k,v in other.definitionLang.items(): - if k in self.definitionLang.keys(): - self.definitionLang[k] += v - else: - self.definitionLang[k] = v - def addGismu(self, gismu): self.gismuCount += 1 + for lang, defs in gismu.getDefinitions().items(): + for d in defs: + assert(lang == d.getLang()) + self.addDefLang(d.getLang()) def addDefLang(self, lang): if lang in self.definitionLang.keys(): @@ -133,115 +116,54 @@ def print(self): for k,v in self.definitionLang.items(): print("%d defs in language %s" % (v, k)) -class GismuValidator(YAMLValidator): - def __init__(self, gismu): - self.gismu = gismu - self.metrics = Metrics() - - def checkGismu(self, ast): - sections = {'word': GismuValidator.checkGismuWord, - 'rafsi': GismuValidator.checkGismuRafsi, - 'examples': GismuValidator.checkGismuExamples, - 'definitions': GismuValidator.checkGismuDefinitions } - result = PASS - for k,v in ast.items(): - sub = sections.get(k) - if sub is not None: - subResult = sub(self, v) - result = RESULT.max(result, subResult) - else: - print("unrecognised section: %s" % (k)) - return result - def checkGismuWord(self, ast): - self.metrics.addGismu(self.gismu) - - # Must match filename - if ast != self.gismu: - "gismu does not match file name" - return FAIL - - # CLL 4.4 - always have five letters - if len(self.gismu) != 5: - "gismu must always have five letters" - return FAIL - - # CLL 4.4 - start with a consonant and end with a single vowel - # CLL 4.4 - always contain exactly one consonant pair - cvString = '' - for l in self.gismu: - cvString += isCorV(l) - if cvString not in ['ccvcv', 'cvccv']: - "gismu form is invalid" - return FAIL - - # TODO: check for conflicting gismu (see CLL 4.14.4) - return PASS - - def checkGismuRafsi(self, ast): - # TODO: check rasfi are well formed - return PASS - - def checkGismuExamples(self, ast): - # TODO: check examples are well formed - return PASS - - def checkGismuDefinitions(self, ast): - for k,v in ast.items(): - self.metrics.addDefLang(k) - # TODO: check definitions are well formed - return PASS - -def gismuFromFilename(gismuFile): - import os - import re - gfile = os.path.basename(gismuFile) - g = re.match('(.....)\.yaml', gfile) - if g: - return g.group(1) - return None - -def validate_gismu(gismuFile): - f = open(gismuFile, 'r') - t = f.read() - import yaml - y = None - try: - y = yaml.load(t) - except yaml.YAMLError as exc: - print(exc) - if hasattr(exc, 'problem_mark'): - mark = exc.problem_mark - print("Error position: (%s:%s)" % (mark.line+1, mark.column+1)) - if y is None: - return FAIL - - g = gismuFromFilename(gismuFile) - gv = GismuValidator(g) - result = gv.checkGismu(y) - return (result, gv.metrics) +# Validating all gismu as a whole collection +class CollectiveValidator: + def __init__(self): + self.data = {} + self.dataDuplicates = {} -def getGismuDirs(): - import os - root = os.path.dirname(os.path.realpath(__file__)) - return [os.path.join(root, d) for d in PATHS_GISMU_DIRS] + def addGismu(self, gismu): + g = GismuInfo(gismu, rasfi) + if gismu not in self.data.keys(): + self.data[gismu] = g + else: + if gismu not in self.dataDuplicates.keys(): + self.dataDuplicates[gismu] = [] + self.dataDuplicates[gismu].append(self.data.pop(gismu)) + self.dataDuplicates[gismu].append(g) def main(): gismu_dirs = getGismuDirs() - count = {PASS: 0, WARNING: 0, FAIL: 0} - total = Metrics() + countFail = 0 + countProcessed = 0 for x in getFileList(gismu_dirs, skipRegex='.*\.md$'): - (result, metrics) = validate_gismu(x) - print("%s: %s" % (x, result.getName())) - count[result] = count[result] + 1 - total.add(metrics) + countProcessed += 1 + try: + load_gismu_file(x) + except gismu.YamlParseException as exc: + countFail += 1 + print(exc) + except gismu.GismuValidationException as exc: + countFail += 1 + print(exc) + except Exception as exc: + # Unknowon/unexpected + raise exc + if countProcessed % 100 == 0: + print("...loaded %d" % (countProcessed)) + + print("...loaded %d" % (countProcessed)) + + total = Metrics() + for k,v in GISMU.items(): + total.addGismu(v) print("Summary:") - for r,c in count.items(): - print("%s %d" % (r.getName(), c)) + print("%d failed to load" % (countFail)) total.print() exitcode = 0 - if count[FAIL]: + if countFail: exitcode=1 import sys sys.exit(exitcode) From 963559fe9b66f17f011ecd0191b97cac9ed61222 Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Mon, 24 Nov 2014 01:15:07 +1000 Subject: [PATCH 3/6] More validator refactoring. --- pygimste/gismu.py | 6 ++ validator.py | 140 +++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 133 insertions(+), 13 deletions(-) diff --git a/pygimste/gismu.py b/pygimste/gismu.py index 0f4f0b5..6e1f5a5 100755 --- a/pygimste/gismu.py +++ b/pygimste/gismu.py @@ -118,6 +118,9 @@ def __init__(self, gismu): # The gismu itself self.gismu = gismu + # Is this gismu experimental + self.experimental = False + # The yaml text representing this gismu self.textYaml = None @@ -136,6 +139,9 @@ def __init__(self, gismu): # None => not cached. self._rafsiForms = None + def setExperimental(self, experi): + self.experimental = experi + def get(self): return self.gismu diff --git a/validator.py b/validator.py index e577147..8979b35 100755 --- a/validator.py +++ b/validator.py @@ -30,8 +30,8 @@ either expressed or implied, of the FreeBSD Project. """ -#PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu'] -PATHS_GISMU_DIRS = ['experimental_gismu'] +PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu'] +#PATHS_GISMU_DIRS = ['experimental_gismu'] from pygimste import gismu @@ -79,6 +79,10 @@ def gismuFromFilename(gismuFile): return g.group(1) return None +def experimentalFromFilename(gismuFile): + import re + return re.search('experimental_gismu', gismuFile) is not None + GISMU = {} def load_gismu_file(gismuFile): @@ -86,6 +90,8 @@ def load_gismu_file(gismuFile): gStr = gismuFromFilename(gismuFile) t = f.read() gObj = gismu.yaml2Gismu(gStr, t) + if experimentalFromFilename(gismuFile): + gObj.setExperimental(True) GISMU[gObj.get()] = gObj def getGismuDirs(): @@ -93,12 +99,29 @@ def getGismuDirs(): root = os.path.dirname(os.path.realpath(__file__)) return [os.path.join(root, d) for d in PATHS_GISMU_DIRS] -class Metrics: +class CollectionVisitor: + def __init__(self): + pass + + def visitGismu(self, word, data): + pass + + def visitCollection(self): + for k,v in GISMU.items(): + self.visitGismu(k,v) + + def print(self): + pass + + def getResult(self): + return True + +class CollectionMetrics(CollectionVisitor): def __init__(self): self.gismuCount = 0 self.definitionLang = {} - def addGismu(self, gismu): + def visitGismu(self, word, gismu): self.gismuCount += 1 for lang, defs in gismu.getDefinitions().items(): for d in defs: @@ -116,14 +139,26 @@ def print(self): for k,v in self.definitionLang.items(): print("%d defs in language %s" % (v, k)) +class GismuInfo: + def __init__(self, word, gismuObj): + pass + # Validating all gismu as a whole collection -class CollectiveValidator: +class CollectionValidator(CollectionVisitor): def __init__(self): self.data = {} self.dataDuplicates = {} + self.setDirty() - def addGismu(self, gismu): - g = GismuInfo(gismu, rasfi) + def setDirty(self): + self.failed = None # None => dirty, need to validate() + self.validationErrors = [] + + def visitGismu(self, gismu, gismuObj): + self.failed = None + + # Cache to find duplicates + g = GismuInfo(gismu, gismuObj) if gismu not in self.data.keys(): self.data[gismu] = g else: @@ -132,6 +167,83 @@ def addGismu(self, gismu): self.dataDuplicates[gismu].append(self.data.pop(gismu)) self.dataDuplicates[gismu].append(g) + def addValidationError(self, ve): + self.failed = True + self.validationErrors.append(ve) + + def validate(self): + if self.failed is not None: + # hasn't changed since the last time we calculated it + return + self.failed = False + self.validationErrors = [] + + # CLL 4.14 - conflicting gismu: identical + if len(self.dataDuplicates.keys()) > 0: + for k in self.dataDuplicates.keys(): + self.addValidationError("duplicate gismu: %s" % (k)) + + # Get sorted list of gismu + glist = [] + glist.extend(self.data.keys()) + glist.extend(self.dataDuplicates.keys()) + glist = sorted(glist) + + # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu) + xPrev = None + for x in glist: + if xPrev is None: + continue + else: + if x[0:4] == xPrev[0:4]: + self.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (x, xPrev)) + + SIMILAR_CONSONANT = { + 'b': ['p', 'v'], + 'c': ['j', 's'], + 'd': ['t'], + 'f': ['p', 'v'], + 'g': ['k', 'x'], + 'j': ['c', 'z'], + 'k': ['g', 'x'], + 'l': ['r'], + 'm': ['n'], + 'n': ['m'], + 'p': ['b', 'f'], + 'r': ['l'], + 's': ['c', 'z'], + 't': ['d'], + 'v': ['b', 'f'], + 'x': ['g', 'k'], + 'z': ['j', 's'], + } + + # CLL 4.14 - conflicting gismu: too similar + for xs in glist: + for i in range(0, len(xs)): + replacement = SIMILAR_CONSONANT.get(xs[i], []) + for r in replacement: + xs2 = xs[:i] + r + xs[i+1:] + if xs2 in self.data.keys(): + self.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (xs, xs2, xs[i], r)) + + # Pairwise checks + for i in range(0, len(glist)): + for j in range(i+1, len(glist)): + (glist[i], glist[j]) + + + def print(self): + self.validate() + for ve in self.validationErrors: + print(ve) + + # True = validate ok + def getResult(self): + self.validate() + return not self.failed + + def main(): gismu_dirs = getGismuDirs() countFail = 0 @@ -152,15 +264,17 @@ def main(): if countProcessed % 100 == 0: print("...loaded %d" % (countProcessed)) - print("...loaded %d" % (countProcessed)) + print("...loaded %d (%d failed to load)" % (countProcessed, countFail)) + + cmet = CollectionMetrics() + cmet.visitCollection() - total = Metrics() - for k,v in GISMU.items(): - total.addGismu(v) + cval = CollectionValidator() + cval.visitCollection() print("Summary:") - print("%d failed to load" % (countFail)) - total.print() + cmet.print() + cval.print() exitcode = 0 if countFail: From d2c5e713a6ff74318e07b72505caa91546d9796b Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Wed, 26 Nov 2014 00:38:29 +1000 Subject: [PATCH 4/6] Improve pairwise tests. --- pygimste/gismu.py | 55 +++++++++++++++++++++++ validator.py | 112 +++++++++++++++++++++++++++------------------- 2 files changed, 121 insertions(+), 46 deletions(-) diff --git a/pygimste/gismu.py b/pygimste/gismu.py index 6e1f5a5..d047545 100755 --- a/pygimste/gismu.py +++ b/pygimste/gismu.py @@ -139,6 +139,27 @@ def __init__(self, gismu): # None => not cached. self._rafsiForms = None + def __lt__(self, other): + return self.gismu < other.gismu + def __le__(self, other): + return self.gismu <= other.gismu + def __eq__(self, other): + return self.gismu == other.gismu + def __ge__(self, other): + return self.gismu >= other.gismu + def __gt__(self, other): + return self.gismu > other.gismu + def __ne__(self, other): + return self.gismu != other.gismu + def __hash__(self): + return hash(self.gismu) + + def __str__(self): + result = self.gismu + if self.experimental: + result += ' [experimental]' + return result + def setExperimental(self, experi): self.experimental = experi @@ -187,6 +208,40 @@ def getPossibleRafsi(self): self._rafsiForms = forms return self._rafsiForms + # Return variations on the gismu form that cannot also be gismu because they are too similar + def getSimilarForms(self): + SIMILAR_CONSONANT = { + 'b': ['p', 'v'], + 'c': ['j', 's'], + 'd': ['t'], + 'f': ['p', 'v'], + 'g': ['k', 'x'], + 'j': ['c', 'z'], + 'k': ['g', 'x'], + 'l': ['r'], + 'm': ['n'], + 'n': ['m'], + 'p': ['b', 'f'], + 'r': ['l'], + 's': ['c', 'z'], + 't': ['d'], + 'v': ['b', 'f'], + 'x': ['g', 'k'], + 'z': ['j', 's'], + } + + # CLL 4.14 - conflicting gismu: too similar + result = [] + for i in range(0, len(self.gismu)): + origLetter = self.gismu[i] + replacements = SIMILAR_CONSONANT.get(origLetter, []) + for r in replacements: + similarForm = self.gismu[:i] + r + self.gismu[i+1:] + # Results are tuple of (similar form, letter index, original letter, new letter) + result.append((similarForm, i, origLetter, r)) + return result + + def addExample(self, example): self.examples.append(example) diff --git a/validator.py b/validator.py index 8979b35..8bd8ca9 100755 --- a/validator.py +++ b/validator.py @@ -143,10 +143,55 @@ class GismuInfo: def __init__(self, word, gismuObj): pass +class PairValidator: + def __init__(self): + pass + + def validatePair(self, validator, a, b): + pass + +class ConflictingPairValidator(PairValidator): + def __init__(self): + pass + + def validatePair(self, validator, a, b): + # CLL 4.14 - conflicting gismu: too similar + for aSimilar, aIdx, aLetter, rLetter in a.getSimilarForms(): + if b.get() == aSimilar: + validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(b), str(a), aLetter, rLetter)) + return + for bSimilar, bIdx, bLetter, rLetter in b.getSimilarForms(): + if a.get() == bSimilar: + validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(a), str(b), bLetter, rLetter)) + return + +class ConflictingRafsiValidator(PairValidator): + def __init__(self): + pass + + def validatePair(self, validator, a, b): + # check for gismu that have the same rafsi + intersect = set(a.getRafsi()).intersection(b.getRafsi()) + if intersect: + validator.addValidationError("gismu have same rafsi: %s %s (common rafsi: %s)" % (str(b), str(a), str(intersect))) + return + + +class FinalVowelValidator(PairValidator): + def __init__(self): + pass + + def validatePair(self, validator, a, b): + # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu) + if a.get()[0:4] == b.get()[0:4]: + if (a.get()[0:4] == "brod") and (b.get()[0:4] == "brod"): + return + validator.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (str(a), str(b))) + # Validating all gismu as a whole collection class CollectionValidator(CollectionVisitor): def __init__(self): - self.data = {} + self.data = set() self.dataDuplicates = {} self.setDirty() @@ -154,17 +199,16 @@ def setDirty(self): self.failed = None # None => dirty, need to validate() self.validationErrors = [] - def visitGismu(self, gismu, gismuObj): + def visitGismu(self, gismu, g): self.failed = None # Cache to find duplicates - g = GismuInfo(gismu, gismuObj) - if gismu not in self.data.keys(): - self.data[gismu] = g + if g not in self.data: + self.data.add(g) else: if gismu not in self.dataDuplicates.keys(): self.dataDuplicates[gismu] = [] - self.dataDuplicates[gismu].append(self.data.pop(gismu)) + self.dataDuplicates[gismu].append(self.data.remove(gismu)) self.dataDuplicates[gismu].append(g) def addValidationError(self, ve): @@ -185,52 +229,28 @@ def validate(self): # Get sorted list of gismu glist = [] - glist.extend(self.data.keys()) - glist.extend(self.dataDuplicates.keys()) + glist.extend(self.data) + for k,v in self.dataDuplicates.items(): + glist.extend(v) glist = sorted(glist) - # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu) - xPrev = None - for x in glist: - if xPrev is None: - continue - else: - if x[0:4] == xPrev[0:4]: - self.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (x, xPrev)) - - SIMILAR_CONSONANT = { - 'b': ['p', 'v'], - 'c': ['j', 's'], - 'd': ['t'], - 'f': ['p', 'v'], - 'g': ['k', 'x'], - 'j': ['c', 'z'], - 'k': ['g', 'x'], - 'l': ['r'], - 'm': ['n'], - 'n': ['m'], - 'p': ['b', 'f'], - 'r': ['l'], - 's': ['c', 'z'], - 't': ['d'], - 'v': ['b', 'f'], - 'x': ['g', 'k'], - 'z': ['j', 's'], - } - - # CLL 4.14 - conflicting gismu: too similar - for xs in glist: - for i in range(0, len(xs)): - replacement = SIMILAR_CONSONANT.get(xs[i], []) - for r in replacement: - xs2 = xs[:i] + r + xs[i+1:] - if xs2 in self.data.keys(): - self.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (xs, xs2, xs[i], r)) + # Adjacent pairwise checks + adjacentPairwiseChecks = [FinalVowelValidator()] + for i in range(1, len(glist)): + a = glist[i-1] + b = glist[i] + for check in adjacentPairwiseChecks: + check.validatePair(self, a, b) # Pairwise checks + pairwiseChecks = [ConflictingPairValidator(), ConflictingRafsiValidator()] for i in range(0, len(glist)): for j in range(i+1, len(glist)): - (glist[i], glist[j]) + a = glist[i] + b = glist[j] + + for check in pairwiseChecks: + check.validatePair(self, a,b) def print(self): From 2d9e99e458c6f74cc6bca121b231056081d7f783 Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Wed, 26 Nov 2014 01:22:17 +1000 Subject: [PATCH 5/6] A little more refactoring. --- validator.py | 59 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/validator.py b/validator.py index 8bd8ca9..81d1f9a 100755 --- a/validator.py +++ b/validator.py @@ -116,6 +116,20 @@ def print(self): def getResult(self): return True +class LevenshteinPairMetric: + # Implementation from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python + def levenshtein(seq1, seq2): + oneago = None + thisrow = range(1, len(seq2) + 1) + [0] + for x in xrange(len(seq1)): + twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] + for y in xrange(len(seq2)): + delcost = oneago[y] + 1 + addcost = thisrow[y - 1] + 1 + subcost = oneago[y - 1] + (seq1[x] != seq2[y]) + thisrow[y] = min(delcost, addcost, subcost) + return thisrow[len(seq2) - 1] + class CollectionMetrics(CollectionVisitor): def __init__(self): self.gismuCount = 0 @@ -143,50 +157,49 @@ class GismuInfo: def __init__(self, word, gismuObj): pass -class PairValidator: +class PairVisitor: def __init__(self): pass - def validatePair(self, validator, a, b): + def visitPair(self, a, b): pass -class ConflictingPairValidator(PairValidator): - def __init__(self): - pass +class ConflictingPairValidator(PairVisitor): + def __init__(self, validator): + self.validator = validator - def validatePair(self, validator, a, b): + def visitPair(self, a, b): # CLL 4.14 - conflicting gismu: too similar for aSimilar, aIdx, aLetter, rLetter in a.getSimilarForms(): if b.get() == aSimilar: - validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(b), str(a), aLetter, rLetter)) + self.validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(b), str(a), aLetter, rLetter)) return for bSimilar, bIdx, bLetter, rLetter in b.getSimilarForms(): if a.get() == bSimilar: - validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(a), str(b), bLetter, rLetter)) + self.validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(a), str(b), bLetter, rLetter)) return -class ConflictingRafsiValidator(PairValidator): - def __init__(self): - pass +class ConflictingRafsiValidator(PairVisitor): + def __init__(self, validator): + self.validator = validator - def validatePair(self, validator, a, b): + def visitPair(self, a, b): # check for gismu that have the same rafsi intersect = set(a.getRafsi()).intersection(b.getRafsi()) if intersect: - validator.addValidationError("gismu have same rafsi: %s %s (common rafsi: %s)" % (str(b), str(a), str(intersect))) - return + self.validator.addValidationError("gismu have same rafsi: %s %s (common rafsi: %s)" % (str(b), str(a), str(intersect))) -class FinalVowelValidator(PairValidator): - def __init__(self): - pass +class FinalVowelValidator(PairVisitor): + def __init__(self, validator): + self.validator = validator - def validatePair(self, validator, a, b): + def visitPair(self, a, b): # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu) if a.get()[0:4] == b.get()[0:4]: if (a.get()[0:4] == "brod") and (b.get()[0:4] == "brod"): return - validator.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (str(a), str(b))) + self.validator.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (str(a), str(b))) # Validating all gismu as a whole collection class CollectionValidator(CollectionVisitor): @@ -235,22 +248,22 @@ def validate(self): glist = sorted(glist) # Adjacent pairwise checks - adjacentPairwiseChecks = [FinalVowelValidator()] + adjacentPairwiseChecks = [FinalVowelValidator(self)] for i in range(1, len(glist)): a = glist[i-1] b = glist[i] for check in adjacentPairwiseChecks: - check.validatePair(self, a, b) + check.visitPair(a, b) # Pairwise checks - pairwiseChecks = [ConflictingPairValidator(), ConflictingRafsiValidator()] + pairwiseChecks = [ConflictingPairValidator(self), ConflictingRafsiValidator(self)] for i in range(0, len(glist)): for j in range(i+1, len(glist)): a = glist[i] b = glist[j] for check in pairwiseChecks: - check.validatePair(self, a,b) + check.visitPair(a,b) def print(self): From 5d0577bdf719c26176ce97ec097e7d4c853cd436 Mon Sep 17 00:00:00 2001 From: Andrew Browne Date: Sun, 26 Apr 2015 15:06:02 +1000 Subject: [PATCH 6/6] Refactoring. Implement Levenshtein distance. --- pygimste/visitors.py | 177 ++++++++++++++++++++++++++ validator.py | 297 ++++++++++++++++++++++++------------------- 2 files changed, 344 insertions(+), 130 deletions(-) create mode 100644 pygimste/visitors.py diff --git a/pygimste/visitors.py b/pygimste/visitors.py new file mode 100644 index 0000000..bba8b52 --- /dev/null +++ b/pygimste/visitors.py @@ -0,0 +1,177 @@ +# 2-clause BSD license +""" +Copyright (c) 2014, Andrew Browne +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +The views and conclusions contained in the software and documentation are those +of the authors and should not be interpreted as representing official policies, +either expressed or implied, of the FreeBSD Project. +""" + +class VisitorBase: + def __init__(self, name): + self.name = name + + def getName(self): + return self.name + + def start(self): + import datetime + self.startTime = datetime.datetime.now() + print("Running visitor %s ..." % (self.getName()), end="") + + def finish(self): + import datetime + self.finishTime = datetime.datetime.now() + c = self.finishTime - self.startTime + print(" %d ms" % (c.total_seconds() * 1000)) + +class CollectionVisitor(VisitorBase): + def __init__(self, name): + VisitorBase.__init__(self, name) + + def visitGismu(self, data): + pass + +class CollectionVisitorPair(VisitorBase): + def __init__(self, name): + VisitorBase.__init__(self, name) + + def visitGismuPair(self, a, b): + pass + +class CollectionVisitorAdjacent(VisitorBase): + def __init__(self, name): + VisitorBase.__init__(self, name) + + def visitGismuAdjacent(self, a, b): + pass + +class CollectionVisitorDuplicates(VisitorBase): + def __init__(self, name): + VisitorBase.__init__(self, name) + + def visitGismuDuplicates(self, dupList): + pass + +# Executes CollectionVisitor and CollectionVisitorPair on the collection +class CollectionVisitorManager: + def __init__(self): + self.visitors = [] + self.visitorsPairwise = [] + self.visitorsAdjacent = [] + self.visitorsDuplicates = [] + self.cacheClear() + + # Visitors + + def addVisitor(self, v): + if hasattr(v, 'visitGismu'): + self.visitors.append(v) + if hasattr(v, 'visitGismuPair'): + self.visitorsPairwise.append(v) + if hasattr(v, 'visitGismuAdjacent'): + self.visitorsAdjacent.append(v) + if hasattr(v, 'visitGismuDuplicates'): + self.visitorsDuplicates.append(v) + + def getVisitors(self): + result = [] + result.extend(self.visitors) + result.extend(self.visitorsPairwise) + result.extend(self.visitorsAdjacent) + result.extend(self.visitorsDuplicates) + return result + + def getMetricVisitors(self): + return [v for v in self.getVisitors() if hasattr(v, 'isMetric') and v.isMetric()] + + def getValidatorVisitors(self): + return [v for v in self.getVisitors() if hasattr(v, 'isValidator') and v.isValidator()] + + # Cache + + def cacheClear(self): + self.data = set() + self.dataDuplicates = {} + self.dataSorted = [] + + def cacheCollection(self, main_gismu_list): + self.cacheClear() + for gismu,g in main_gismu_list.items(): + # Cache to find duplicates + if g not in self.data: + self.data.add(g) + else: + if gismu not in self.dataDuplicates.keys(): + self.dataDuplicates[gismu] = [] + self.dataDuplicates[gismu].append(self.data.remove(gismu)) + self.dataDuplicates[gismu].append(g) + + # Get sorted list of gismu + glist = [] + glist.extend(self.data) + for k,v in self.dataDuplicates.items(): + glist.extend(v) + self.dataSorted = sorted(glist) + + # Visiting + + def visit(self, main_gismu_list): + self.cacheCollection(main_gismu_list) + + # Gismu visitors + for check in self.visitors: + check.start() + for g in self.dataSorted: + check.visitGismu(g) + check.finish() + + # Duplicate Gismu visitors + for check in self.visitorsDuplicates: + check.start() + for k,v in self.dataDuplicates.iteritems(): + assert(len(v) > 1) + check.visitGismuDuplicates(v) + check.finish() + + # Adjacent pairwise visitors + for check in self.visitorsAdjacent: + check.start() + for i in range(1, len(self.dataSorted)): + a = self.dataSorted[i-1] + b = self.dataSorted[i] + check.visitGismuAdjacent(a, b) + check.finish() + + # Pairwise visitors + for check in self.visitorsPairwise: + check.start() + for i in range(0, len(self.dataSorted)): + for j in range(i+1, len(self.dataSorted)): + a = self.dataSorted[i] + b = self.dataSorted[j] + check.visitGismuPair(a,b) + check.finish() + + diff --git a/validator.py b/validator.py index 81d1f9a..9dbab4d 100755 --- a/validator.py +++ b/validator.py @@ -34,6 +34,7 @@ #PATHS_GISMU_DIRS = ['experimental_gismu'] from pygimste import gismu +from pygimste import visitors def getFileList(args, realpath=True, skipRegex=None): import os @@ -99,43 +100,57 @@ def getGismuDirs(): root = os.path.dirname(os.path.realpath(__file__)) return [os.path.join(root, d) for d in PATHS_GISMU_DIRS] -class CollectionVisitor: +class GismuValidationError: + def __init__(self, msg, args): + self.msg = msg + self.gismu = args + + def print(self): + gismulist = ', '.join([str(g) for g in self.gismu]) + print("%s: %s" % (self.msg, gismulist)) + +class Validator: def __init__(self): - pass + self.validationErrors = [] - def visitGismu(self, word, data): - pass + def isValidator(self): + return True - def visitCollection(self): - for k,v in GISMU.items(): - self.visitGismu(k,v) + def addValidationError(self, msg, *args): + self.failed = True + self.validationErrors.append(GismuValidationError(msg, args)) def print(self): + for ve in self.validationErrors: + ve.print() + + def isValid(self): + return len(self.validationErrors) == 0 + +class Metric: + def __init__(self, name): + self.name = name pass - def getResult(self): + def isMetric(self): return True -class LevenshteinPairMetric: - # Implementation from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python - def levenshtein(seq1, seq2): - oneago = None - thisrow = range(1, len(seq2) + 1) + [0] - for x in xrange(len(seq1)): - twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] - for y in xrange(len(seq2)): - delcost = oneago[y] + 1 - addcost = thisrow[y - 1] + 1 - subcost = oneago[y - 1] + (seq1[x] != seq2[y]) - thisrow[y] = min(delcost, addcost, subcost) - return thisrow[len(seq2) - 1] + def print(self): + print("---- %s: ----" % (self.name)) + self.printResults() + + def printResults(self): + pass -class CollectionMetrics(CollectionVisitor): +class LanguageMetrics(visitors.CollectionVisitor, Metric): def __init__(self): + name = "Language Counts Metrics" + Metric.__init__(self, name) + visitors.CollectionVisitor.__init__(self, name) self.gismuCount = 0 self.definitionLang = {} - def visitGismu(self, word, gismu): + def visitGismu(self, gismu): self.gismuCount += 1 for lang, defs in gismu.getDefinitions().items(): for d in defs: @@ -148,137 +163,131 @@ def addDefLang(self, lang): else: self.definitionLang[lang] = 1 - def print(self): + def printResults(self): print("%d gismu" % (self.gismuCount)) for k,v in self.definitionLang.items(): print("%d defs in language %s" % (v, k)) -class GismuInfo: - def __init__(self, word, gismuObj): - pass - -class PairVisitor: +class ConflictingPairValidator(visitors.CollectionVisitorPair, Validator): def __init__(self): - pass - - def visitPair(self, a, b): - pass - -class ConflictingPairValidator(PairVisitor): - def __init__(self, validator): - self.validator = validator + name = "Conflicting Gismu Validator" + Validator.__init__(self) + visitors.CollectionVisitorPair.__init__(self, name) - def visitPair(self, a, b): + def visitGismuPair(self, a, b): # CLL 4.14 - conflicting gismu: too similar for aSimilar, aIdx, aLetter, rLetter in a.getSimilarForms(): if b.get() == aSimilar: - self.validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(b), str(a), aLetter, rLetter)) + msg = "gismu too similar (differs by %s to %s)" % (aLetter, rLetter) + self.addValidationError(msg, a, b) return for bSimilar, bIdx, bLetter, rLetter in b.getSimilarForms(): if a.get() == bSimilar: - self.validator.addValidationError("gismu too similar: %s %s (differs by %s to %s)" % (str(a), str(b), bLetter, rLetter)) + msg = "gismu too similar (differs by %s to %s)" % (bLetter, rLetter) + self.addValidationError(msg, a, b) return -class ConflictingRafsiValidator(PairVisitor): - def __init__(self, validator): - self.validator = validator +class ConflictingRafsiValidator(visitors.CollectionVisitorPair, Validator): + def __init__(self): + name = "Conflicting Rafsi Validator" + Validator.__init__(self) + visitors.CollectionVisitorPair.__init__(self, name) - def visitPair(self, a, b): + def visitGismuPair(self, a, b): # check for gismu that have the same rafsi intersect = set(a.getRafsi()).intersection(b.getRafsi()) if intersect: - self.validator.addValidationError("gismu have same rafsi: %s %s (common rafsi: %s)" % (str(b), str(a), str(intersect))) + msg = "gismu have same rafsi (common rafsi: %s)" % (str(intersect)) + self.addValidationError(msg, a, b) -class FinalVowelValidator(PairVisitor): - def __init__(self, validator): - self.validator = validator +class FinalVowelValidator(visitors.CollectionVisitorAdjacent, Validator): + def __init__(self): + name = "Final Vowel Validator" + Validator.__init__(self) + visitors.CollectionVisitorAdjacent.__init__(self, name) - def visitPair(self, a, b): + def visitGismuAdjacent(self, a, b): # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu) if a.get()[0:4] == b.get()[0:4]: if (a.get()[0:4] == "brod") and (b.get()[0:4] == "brod"): return - self.validator.addValidationError("no two gismu can differ only in the final vowel: %s %s" % (str(a), str(b))) + msg = "gismu only differ in final vowel" % (str(a), str(b)) + self.addValidationError(msg, a, b) -# Validating all gismu as a whole collection -class CollectionValidator(CollectionVisitor): +class DuplicateValidator(visitors.CollectionVisitorDuplicates, Validator): def __init__(self): - self.data = set() - self.dataDuplicates = {} - self.setDirty() - - def setDirty(self): - self.failed = None # None => dirty, need to validate() - self.validationErrors = [] - - def visitGismu(self, gismu, g): - self.failed = None - - # Cache to find duplicates - if g not in self.data: - self.data.add(g) - else: - if gismu not in self.dataDuplicates.keys(): - self.dataDuplicates[gismu] = [] - self.dataDuplicates[gismu].append(self.data.remove(gismu)) - self.dataDuplicates[gismu].append(g) - - def addValidationError(self, ve): - self.failed = True - self.validationErrors.append(ve) - - def validate(self): - if self.failed is not None: - # hasn't changed since the last time we calculated it - return - self.failed = False - self.validationErrors = [] + name = "Duplicate Validator" + Validator.__init__(self) + visitors.CollectionVisitorDuplicates.__init__(self, name) + def visitGismuDuplicates(self, dupList): # CLL 4.14 - conflicting gismu: identical - if len(self.dataDuplicates.keys()) > 0: - for k in self.dataDuplicates.keys(): - self.addValidationError("duplicate gismu: %s" % (k)) - - # Get sorted list of gismu - glist = [] - glist.extend(self.data) - for k,v in self.dataDuplicates.items(): - glist.extend(v) - glist = sorted(glist) - - # Adjacent pairwise checks - adjacentPairwiseChecks = [FinalVowelValidator(self)] - for i in range(1, len(glist)): - a = glist[i-1] - b = glist[i] - for check in adjacentPairwiseChecks: - check.visitPair(a, b) - - # Pairwise checks - pairwiseChecks = [ConflictingPairValidator(self), ConflictingRafsiValidator(self)] - for i in range(0, len(glist)): - for j in range(i+1, len(glist)): - a = glist[i] - b = glist[j] - - for check in pairwiseChecks: - check.visitPair(a,b) - + msg = "gismu duplicate form" + self.addValidationError(msg, *dupList) - def print(self): - self.validate() - for ve in self.validationErrors: - print(ve) +class LevenshteinPairMetric(visitors.CollectionVisitorPair, Metric): + def __init__(self): + name = "Smallest Levenshtein Distances Metric" + Metric.__init__(self, name) + visitors.CollectionVisitorPair.__init__(self, name) + self.top = [] + + @staticmethod + def levenshtein1(seq1, seq2): + # Implementation from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python + oneago = None + thisrow = range(1, len(seq2) + 1) + [0] + for x in xrange(len(seq1)): + twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] + for y in xrange(len(seq2)): + delcost = oneago[y] + 1 + addcost = thisrow[y - 1] + 1 + subcost = oneago[y - 1] + (seq1[x] != seq2[y]) + thisrow[y] = min(delcost, addcost, subcost) + return thisrow[len(seq2) - 1] - # True = validate ok - def getResult(self): - self.validate() - return not self.failed + @staticmethod + def levenshtein2(seq1, seq2): + # Implementation from http://rosettacode.org/wiki/Levenshtein_distance#Python + from functools import lru_cache + @lru_cache(maxsize=4095) + def ld(s, t): + if not s: return len(t) + if not t: return len(s) + if s[0] == t[0]: return ld(s[1:], t[1:]) + l1 = ld(s, t[1:]) + l2 = ld(s[1:], t) + l3 = ld(s[1:], t[1:]) + return 1 + min(l1, l2, l3) + return ld(seq1, seq2) + + @staticmethod + def levenshtein(seq1, seq2): + return LevenshteinPairMetric.levenshtein2(seq1, seq2) + + def visitGismuPair(self, a, b): + import heapq + ldist = LevenshteinPairMetric.levenshtein(a.get(), b.get()) + heapq.heappush(self.top, (ldist, (a, b))) + + def printResults(self): + import heapq + if True: + # Print all results < 3 + while self.top: + (s, (a,b)) = heapq.heappop(self.top) + if s >= 3: + break + print("%d\t%s to %s" % (s, str(a), str(b))) + else: + # Print top 32 results + small = heapq.nsmallest(32, self.top) + for (s, (a,b)) in small: + print("%d\t%s to %s" % (s, str(a), str(b))) -def main(): - gismu_dirs = getGismuDirs() +def LoadGismu(gismu_dirs): countFail = 0 countProcessed = 0 for x in getFileList(gismu_dirs, skipRegex='.*\.md$'): @@ -297,21 +306,49 @@ def main(): if countProcessed % 100 == 0: print("...loaded %d" % (countProcessed)) - print("...loaded %d (%d failed to load)" % (countProcessed, countFail)) + print("...loaded %d" % (countProcessed)) + print("%d failed to load" % (countFail)) - cmet = CollectionMetrics() - cmet.visitCollection() + return countFail - cval = CollectionValidator() - cval.visitCollection() +def main(): + # Load Gismu + gismu_dirs = getGismuDirs() + countFail = LoadGismu(gismu_dirs) + + # Create visitors + visitMan = visitors.CollectionVisitorManager() + visitMan.addVisitor(ConflictingPairValidator()) + visitMan.addVisitor(ConflictingRafsiValidator()) + visitMan.addVisitor(FinalVowelValidator()) + visitMan.addVisitor(LanguageMetrics()) + #visitMan.addVisitor(LevenshteinPairMetric()) + + # Run visitors + visitMan.visit(GISMU) + + # Output Results + validators = visitMan.getValidatorVisitors() + hasInvalid = False in [v.isValid() for v in validators] + print("==== Validation: ====") + for v in validators: + v.print() + if hasInvalid: + print("FAIL") + else: + print("PASS") - print("Summary:") - cmet.print() - cval.print() + metrics = visitMan.getMetricVisitors() + print("==== Summary: ====") + for m in metrics: + m.print() + # Exit with error code exitcode = 0 if countFail: - exitcode=1 + exitcode = exitcode | 1 + if hasInvalid: + exitcode = exitcode | 2 import sys sys.exit(exitcode)