diff --git a/pygimste/__init__.py b/pygimste/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/pygimste/gismu.py b/pygimste/gismu.py
new file mode 100755
index 0000000..d047545
--- /dev/null
+++ b/pygimste/gismu.py
@@ -0,0 +1,408 @@
+# 2-clause BSD license
+"""
+Copyright (c) 2014, Andrew Browne
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of the FreeBSD Project.
+"""
+
+import operator
+
+# Morphology utilities
+class Morph:
+    """Classify the letters of a lojban word as consonants or vowels."""
+
+    CONSONANTS = frozenset('bcdfgjklmnprstvxz')
+    VOWELS = frozenset('aeiou')
+    # Characters that are valid in a word and are classified as themselves.
+    PUNCTUATION = frozenset("',.")
+
+    @staticmethod
+    def isCorV(ch):
+        """Return the class of a single character.
+
+        Returns 'c' for a consonant, 'v' for a vowel, 'y' for the letter y,
+        the character itself for an apostrophe, comma or period, and '#'
+        for anything that cannot appear in a lojban word.
+        """
+        lowered = ch.lower()
+        if lowered in Morph.CONSONANTS:
+            return 'c'
+        if lowered in Morph.VOWELS:
+            return 'v'
+        # Compare by value: 'is' tests identity, which only happens to work
+        # for short string literals because of interning.
+        if lowered == 'y':
+            # sort of a consonant
+            return 'y'
+        if ch in Morph.PUNCTUATION:
+            return ch
+        # character is not in lojban word
+        return '#'
+
+    @staticmethod
+    def getCVstring(str):
+        """Return the consonant/vowel pattern of a word, e.g. 'cvccv'.
+
+        The parameter keeps its original name, although it shadows the
+        builtin, so that existing keyword callers keep working.
+        """
+        return ''.join(Morph.isCorV(letter) for letter in str)
+
+class YamlParseException(Exception):
+    """Raised when the yaml text of a gismu cannot be parsed or is inconsistent."""
+
+    def __init__(self, exc, msg, line=0, col=0):
+        self.exc = exc
+        self.message = msg
+        self.line = line
+        self.col = col
+
+    def __str__(self):
+        # Give print(exc) a readable message.
+        return str(self.message)
+
+class GismuValidationException(Exception):
+    """Raised when a gismu breaks one of the morphology rules."""
+
+    def __init__(self, gismu, message):
+        self.gismu = gismu
+        self.message = message
+
+    def __str__(self):
+        # Give print(exc) a readable message that names the gismu.
+        return "%s: %s" % (self.gismu, self.message)
+
+class GismuExampleTranslation:
+    """One translation of an example sentence into a given language."""
+
+    def __init__(self, lang, text):
+        self.lang = lang
+        self.text = text
+
+    def getLang(self):
+        return self.lang
+
+    def getText(self):
+        return self.text
+
+class GismuExample:
+    """A lojban example sentence together with its translations."""
+
+    def __init__(self, jboText):
+        self.jbo = jboText
+        # Not a map, because one lang could have multiple variant translations
+        self.translations = []
+
+    def addTranslation(self, lang, text):
+        self.translations.append(GismuExampleTranslation(lang, text))
+
+    def getLojbanText(self):
+        return self.jbo
+
+    def getTranslations(self):
+        return self.translations
+
+class GismuDefinition:
+    """The definition of a gismu in one language."""
+
+    def __init__(self, lang):
+        self.lang = lang
+        self.placeStructure = None
+        self.notes = []
+        self.glosses = []
+
+    def getLang(self):
+        return self.lang
+
+    def setPlace(self, place):
+        self.placeStructure = place
+
+    def addGlosses(self, glosses):
+        self.glosses.extend(glosses)
+
+    def addNotes(self, notes):
+        self.notes.extend(notes)
+
+class Gismu:
+    """A single gismu and the data attached to it.
+
+    Instances compare, sort and hash by the word itself.
+    """
+
+    # Consonants that are too similar to each other (CLL 4.14).
+    SIMILAR_CONSONANT = {
+        'b': ['p', 'v'],
+        'c': ['j', 's'],
+        'd': ['t'],
+        'f': ['p', 'v'],
+        'g': ['k', 'x'],
+        'j': ['c', 'z'],
+        'k': ['g', 'x'],
+        'l': ['r'],
+        'm': ['n'],
+        'n': ['m'],
+        'p': ['b', 'f'],
+        'r': ['l'],
+        's': ['c', 'z'],
+        't': ['d'],
+        'v': ['b', 'f'],
+        'x': ['g', 'k'],
+        'z': ['j', 's'],
+    }
+
+    def __init__(self, gismu):
+        # The gismu itself
+        self.gismu = gismu
+
+        # Is this gismu experimental
+        self.experimental = False
+
+        # The yaml text representing this gismu
+        self.textYaml = None
+
+        # A string representing the consonant/vowel pattern in the gismu.
+        # Morphology requires 'cvccv' or 'ccvcv'.
+        self.gismuCV = Morph.getCVstring(self.gismu)
+
+        # A list of rafsi associated with this gismu
+        self.rafsi = []
+
+        self.examples = []
+        self.definitions = {}
+
+        # A list of all possible valid rafsi forms. Not all of these are used.
+        # Only access this via getPossibleRafsi().
+        # None => not cached.
+        self._rafsiForms = None
+
+    def _compare(self, other, op):
+        # Leave comparisons with anything but a Gismu to the other operand
+        # instead of failing on a missing 'gismu' attribute.
+        if not isinstance(other, Gismu):
+            return NotImplemented
+        return op(self.gismu, other.gismu)
+
+    def __lt__(self, other):
+        return self._compare(other, operator.lt)
+    def __le__(self, other):
+        return self._compare(other, operator.le)
+    def __eq__(self, other):
+        return self._compare(other, operator.eq)
+    def __ge__(self, other):
+        return self._compare(other, operator.ge)
+    def __gt__(self, other):
+        return self._compare(other, operator.gt)
+    def __ne__(self, other):
+        return self._compare(other, operator.ne)
+    def __hash__(self):
+        return hash(self.gismu)
+
+    def __str__(self):
+        if self.experimental:
+            return self.gismu + ' [experimental]'
+        return self.gismu
+
+    def setExperimental(self, experi):
+        self.experimental = experi
+
+    def get(self):
+        return self.gismu
+
+    def setTextYaml(self, text):
+        self.textYaml = text
+
+    def getTextYaml(self):
+        """Return the yaml text set with setTextYaml(), or None."""
+        return self.textYaml
+
+    def addRafsi(self, rafsi):
+        self.rafsi.append(rafsi)
+
+    # Returns a list of rafsi associated with this gismu
+    def getRafsi(self):
+        return self.rafsi
+
+    def getPossibleRafsi(self):
+        """Return a list of all valid rafsi forms for this gismu.
+
+        This must be a superset of the rafsi associated with this gismu;
+        not all possible valid forms are associated with it. The list is
+        computed on first use and cached.
+
+        Raises GismuValidationException if the consonant/vowel pattern is
+        neither 'cvccv' nor 'ccvcv'.
+        """
+        if self._rafsiForms is None:
+            g = self.gismu
+            # CLL 4.6 - the 5-letter-rafsi, and the 4-letter-rafsi
+            forms = [g, g[0:4]]
+            if self.gismuCV == 'cvccv':
+                # CLL 4.6 - valid short rafsi forms for cvccv
+                forms.extend([
+                    (g[0] + g[1] + g[2]),
+                    (g[0] + g[1] + g[3]),
+                    (g[0] + g[1] + '\'' + g[4]),
+                    (g[0] + g[1] + g[4]),
+                    (g[2] + g[3] + g[4]),
+                    (g[0] + g[2] + g[1])])
+            elif self.gismuCV == 'ccvcv':
+                # CLL 4.6 - valid short rafsi forms for ccvcv
+                forms.extend([
+                    (g[0] + g[2] + g[3]),
+                    (g[1] + g[2] + g[3]),
+                    (g[0] + g[2] + '\'' + g[4]),
+                    (g[0] + g[2] + g[4]),
+                    (g[1] + g[2] + '\'' + g[4]),
+                    (g[1] + g[2] + g[4]),
+                    (g[0] + g[1] + g[2])])
+            else:
+                raise GismuValidationException(self.gismu, "gismu form is invalid: %s" % (self.gismuCV))
+            self._rafsiForms = forms
+        return self._rafsiForms
+
+    def getSimilarForms(self):
+        """Return variations on the gismu that cannot also be gismu.
+
+        They are too similar to this gismu. Each result is a tuple of
+        (similar form, letter index, original letter, new letter).
+        """
+        # CLL 4.14 - conflicting gismu: too similar
+        result = []
+        for i, origLetter in enumerate(self.gismu):
+            for r in Gismu.SIMILAR_CONSONANT.get(origLetter, []):
+                similarForm = self.gismu[:i] + r + self.gismu[i+1:]
+                result.append((similarForm, i, origLetter, r))
+        return result
+
+    def addExample(self, example):
+        self.examples.append(example)
+
+    def addDefinition(self, definition):
+        # Definitions are grouped by language.
+        self.definitions.setdefault(definition.getLang(), []).append(definition)
+
+    def getDefinitions(self):
+        return self.definitions
+
+    def validateForms(self):
+        """Check the morphology of the gismu and of its rafsi.
+
+        Raises GismuValidationException on the first rule that is broken.
+        """
+        # CLL 4.4 - always have five letters
+        if len(self.gismu) != 5:
+            raise GismuValidationException(self.gismu, "gismu must always have five letters")
+
+        # CLL 4.4 - start with a consonant and end with a single vowel
+        # CLL 4.4 - always contain exactly one consonant pair
+        if self.gismuCV not in ['ccvcv', 'cvccv']:
+            raise GismuValidationException(self.gismu, "gismu form is invalid: %s" % (self.gismuCV))
+
+        shortRafsi = self.getPossibleRafsi()
+        for r in self.rafsi:
+            if r not in shortRafsi:
+                raise GismuValidationException(self.gismu, "rafsi form is invalid: %s" % (r))
+
+    def validateExamples(self):
+        # TODO: check examples are well formed
+        pass
+
+    def validateDefinitions(self):
+        # TODO: check definitions are well formed
+        pass
+
+    def validate(self):
+        self.validateForms()
+        self.validateExamples()
+        self.validateDefinitions()
+
+def yaml2Gismu(gismu, text):
+    """Build a validated Gismu from its yaml text.
+
+    gismu is the word the text is expected to describe. Returns None when
+    the yaml document is empty.
+
+    Raises YamlParseException if the text is not valid yaml or describes a
+    different word, and GismuValidationException if validation fails.
+    """
+    import yaml
+    try:
+        # safe_load builds plain python objects only, which is all a gismu
+        # file needs. yaml.load() without a Loader is deprecated.
+        y = yaml.safe_load(text)
+    except yaml.YAMLError as exc:
+        line = 0
+        col = 0
+        if hasattr(exc, 'problem_mark'):
+            mark = exc.problem_mark
+            print("Error position: (%s:%s)" % (mark.line+1, mark.column+1))
+            line = mark.line+1
+            col = mark.column+1
+        raise YamlParseException(exc, str(exc), line, col) from exc
+    if y is None:
+        return None
+
+    # Compare the word before building the Gismu: when no word could be
+    # taken from the file name this reports a parse error instead of
+    # failing inside the Gismu constructor.
+    if y['word'] != gismu:
+        raise YamlParseException(None, "gismu word does not match")
+
+    result = Gismu(gismu)
+
+    rafsiData = y['rafsi']
+    if rafsiData != ['No rafsi.']:
+        for r in rafsiData:
+            result.addRafsi(r)
+
+    for ex in y['examples']:
+        result.addExample(ex)
+
+    for lang, v in y['definitions'].items():
+        gd = GismuDefinition(lang)
+        gd.setPlace(v['place structure'])
+        gd.addGlosses(v['glosses'])
+        gd.addNotes(v['notes'])
+        result.addDefinition(gd)
+
+    result.validate()
+    result.setTextYaml(text)
+    return result
+
+def Gismu2yaml(g):
+    """Return yaml text for a gismu, or None if it cannot be dumped.
+
+    The text stored on the gismu is returned when there is one.
+    """
+    if g.getTextYaml():
+        return g.getTextYaml()
+
+    import yaml
+    try:
+        return yaml.dump(g)
+    except yaml.YAMLError:
+        # Best effort: report a failed dump as None.
+        return None
diff --git a/pygimste/visitors.py b/pygimste/visitors.py
new file mode 100644
index 0000000..bba8b52
--- /dev/null
+++ b/pygimste/visitors.py
@@ -0,0 +1,190 @@
+# 2-clause BSD license
+"""
+Copyright (c) 2014, Andrew Browne
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of the FreeBSD Project.
+"""
+
+import datetime
+import itertools
+
+class VisitorBase:
+    """Base of all visitors: holds the name and prints timing information."""
+
+    def __init__(self, name):
+        self.name = name
+
+    def getName(self):
+        return self.name
+
+    def start(self):
+        """Record the start time and announce the visitor, without a newline."""
+        self.startTime = datetime.datetime.now()
+        print("Running visitor %s ..." % (self.getName()), end="")
+
+    def finish(self):
+        """Record the finish time and print the milliseconds since start()."""
+        self.finishTime = datetime.datetime.now()
+        c = self.finishTime - self.startTime
+        print(" %d ms" % (c.total_seconds() * 1000))
+
+class CollectionVisitor(VisitorBase):
+    """Visitor that is called once for every gismu."""
+
+    def __init__(self, name):
+        VisitorBase.__init__(self, name)
+
+    def visitGismu(self, data):
+        pass
+
+class CollectionVisitorPair(VisitorBase):
+    """Visitor that is called once for every unordered pair of gismu."""
+
+    def __init__(self, name):
+        VisitorBase.__init__(self, name)
+
+    def visitGismuPair(self, a, b):
+        pass
+
+class CollectionVisitorAdjacent(VisitorBase):
+    """Visitor that is called for every two neighbours in sorted order."""
+
+    def __init__(self, name):
+        VisitorBase.__init__(self, name)
+
+    def visitGismuAdjacent(self, a, b):
+        pass
+
+class CollectionVisitorDuplicates(VisitorBase):
+    """Visitor that is called once for every group of duplicate gismu."""
+
+    def __init__(self, name):
+        VisitorBase.__init__(self, name)
+
+    def visitGismuDuplicates(self, dupList):
+        pass
+
+# Runs every kind of collection visitor over a collection of gismu
+class CollectionVisitorManager:
+    def __init__(self):
+        self.visitors = []
+        self.visitorsPairwise = []
+        self.visitorsAdjacent = []
+        self.visitorsDuplicates = []
+        self.cacheClear()
+
+    # Visitors
+
+    def addVisitor(self, v):
+        """Register a visitor under every visit method that it implements."""
+        if hasattr(v, 'visitGismu'):
+            self.visitors.append(v)
+        if hasattr(v, 'visitGismuPair'):
+            self.visitorsPairwise.append(v)
+        if hasattr(v, 'visitGismuAdjacent'):
+            self.visitorsAdjacent.append(v)
+        if hasattr(v, 'visitGismuDuplicates'):
+            self.visitorsDuplicates.append(v)
+
+    def getVisitors(self):
+        return (self.visitors + self.visitorsPairwise +
+                self.visitorsAdjacent + self.visitorsDuplicates)
+
+    def getMetricVisitors(self):
+        return [v for v in self.getVisitors() if hasattr(v, 'isMetric') and v.isMetric()]
+
+    def getValidatorVisitors(self):
+        return [v for v in self.getVisitors() if hasattr(v, 'isValidator') and v.isValidator()]
+
+    # Cache
+
+    def cacheClear(self):
+        self.data = set()
+        self.dataDuplicates = {}
+        self.dataSorted = []
+
+    def cacheCollection(self, main_gismu_list):
+        """Split the values of a dict of gismu into unique and duplicate ones.
+
+        Afterwards self.data holds the gismu that occur once,
+        self.dataDuplicates maps each repeated word to the list of gismu
+        that share it, and self.dataSorted holds all of them in sorted order.
+        """
+        self.cacheClear()
+        unique = {}
+        for g in main_gismu_list.values():
+            word = g.get()
+            if word in self.dataDuplicates:
+                # Third or later occurrence
+                self.dataDuplicates[word].append(g)
+            elif word in unique:
+                # Second occurrence: the first one is not unique after all
+                self.dataDuplicates[word] = [unique.pop(word), g]
+            else:
+                unique[word] = g
+        self.data = set(unique.values())
+
+        # Get sorted list of gismu
+        glist = list(self.data)
+        for v in self.dataDuplicates.values():
+            glist.extend(v)
+        self.dataSorted = sorted(glist)
+
+    # Visiting
+
+    def visit(self, main_gismu_list):
+        """Cache the collection, then run all registered visitors over it."""
+        self.cacheCollection(main_gismu_list)
+
+        # Gismu visitors
+        for check in self.visitors:
+            check.start()
+            for g in self.dataSorted:
+                check.visitGismu(g)
+            check.finish()
+
+        # Duplicate Gismu visitors
+        for check in self.visitorsDuplicates:
+            check.start()
+            # dict.iteritems() is gone in python 3; only the values are used
+            for v in self.dataDuplicates.values():
+                assert(len(v) > 1)
+                check.visitGismuDuplicates(v)
+            check.finish()
+
+        # Adjacent pairwise visitors
+        for check in self.visitorsAdjacent:
+            check.start()
+            for a, b in zip(self.dataSorted, self.dataSorted[1:]):
+                check.visitGismuAdjacent(a, b)
+            check.finish()
+
+        # Pairwise visitors
+        for check in self.visitorsPairwise:
+            check.start()
+            for a, b in itertools.combinations(self.dataSorted, 2):
+                check.visitGismuPair(a, b)
+            check.finish()
diff --git a/validator.py b/validator.py
new file mode 100755
index 0000000..9dbab4d
--- /dev/null
+++ b/validator.py
@@ -0,0 +1,380 @@
+#!/usr/bin/env python3
+
+# 2-clause BSD license
+"""
+Copyright (c) 2014, Andrew Browne
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+The views and conclusions contained in the software and documentation are those
+of the authors and should not be interpreted as representing official policies,
+either expressed or implied, of the FreeBSD Project.
+"""
+
+PATHS_GISMU_DIRS = ['gismu', 'experimental_gismu']
+#PATHS_GISMU_DIRS = ['experimental_gismu']
+
+from pygimste import gismu
+from pygimste import visitors
+
+def getFileList(args, realpath=True, skipRegex=None):
+    """Return the sorted list of files found under the given paths.
+
+    Directories in args are walked recursively and other paths are taken
+    as they are. Without args the current working directory is used. With
+    realpath the results are resolved with os.path.realpath, and paths
+    that match skipRegex from their start are left out.
+    """
+    import os
+    # If no arguments are given, start from the current working directory
+    initial = list(args) if args else [os.getcwd()]
+
+    # Build up the list of files under the initial paths
+    result = []
+    for init in initial:
+        if os.path.isdir(init):
+            # Recurse into directories
+            for root, dirs, files in os.walk(init):
+                result.extend(os.path.join(root, f) for f in files)
+        else:
+            result.append(init)
+
+    # Realpath results
+    if realpath:
+        result = [os.path.realpath(x) for x in result]
+
+    if skipRegex:
+        import re
+        skipRE = re.compile(skipRegex)
+        result = [x for x in result if not skipRE.match(x)]
+
+    return sorted(result)
+
+
+def gismuFromFilename(gismuFile):
+    """Return the five characters before '.yaml' in the file name, or None."""
+    import os
+    import re
+    gfile = os.path.basename(gismuFile)
+    # Raw string: '\.' is an invalid escape in a normal string literal
+    g = re.match(r'(.....)\.yaml', gfile)
+    if g:
+        return g.group(1)
+    return None
+
+def experimentalFromFilename(gismuFile):
+    """Return True if the path mentions the experimental gismu directory."""
+    return 'experimental_gismu' in gismuFile
+
+GISMU = {}
+
+def load_gismu_file(gismuFile):
+    """Load one gismu yaml file into the global GISMU dict.
+
+    Raises gismu.YamlParseException or gismu.GismuValidationException if
+    the file does not hold a valid gismu.
+    """
+    gStr = gismuFromFilename(gismuFile)
+    # Close the file again, and do not depend on the locale for the encoding
+    with open(gismuFile, 'r', encoding='utf-8') as f:
+        t = f.read()
+    gObj = gismu.yaml2Gismu(gStr, t)
+    if gObj is None:
+        # yaml2Gismu returns None for an empty document
+        raise gismu.YamlParseException(None, "gismu file is empty")
+    if experimentalFromFilename(gismuFile):
+        gObj.setExperimental(True)
+    GISMU[gObj.get()] = gObj
+
+def getGismuDirs():
+    import os
+    root = os.path.dirname(os.path.realpath(__file__))
+    return [os.path.join(root, d) for d in PATHS_GISMU_DIRS]
+
+class GismuValidationError:
+    """A validation message together with the gismu that it is about."""
+
+    def __init__(self, msg, args):
+        self.msg = msg
+        self.gismu = args
+
+    def print(self):
+        gismulist = ', '.join([str(g) for g in self.gismu])
+        print("%s: %s" % (self.msg, gismulist))
+
+class Validator:
+    """Mixin for visitors that collect validation errors."""
+
+    def __init__(self):
+        self.validationErrors = []
+        # Set by addValidationError(); initialised so it can always be read
+        self.failed = False
+
+    def isValidator(self):
+        return True
+
+    def addValidationError(self, msg, *args):
+        self.failed = True
+        self.validationErrors.append(GismuValidationError(msg, args))
+
+    def print(self):
+        for ve in self.validationErrors:
+            ve.print()
+
+    def isValid(self):
+        return len(self.validationErrors) == 0
+
+class Metric:
+    """Mixin for visitors that report a metric under a heading."""
+
+    def __init__(self, name):
+        self.name = name
+
+    def isMetric(self):
+        return True
+
+    def print(self):
+        print("---- %s: ----" % (self.name))
+        self.printResults()
+
+    def printResults(self):
+        pass
+
+class LanguageMetrics(visitors.CollectionVisitor, Metric):
+    """Counts the gismu and the definitions per language."""
+
+    def __init__(self):
+        name = "Language Counts Metrics"
+        Metric.__init__(self, name)
+        visitors.CollectionVisitor.__init__(self, name)
+        self.gismuCount = 0
+        self.definitionLang = {}
+
+    def visitGismu(self, gismu):
+        self.gismuCount += 1
+        for lang, defs in gismu.getDefinitions().items():
+            for d in defs:
+                assert(lang == d.getLang())
+                self.addDefLang(d.getLang())
+
+    def addDefLang(self, lang):
+        self.definitionLang[lang] = self.definitionLang.get(lang, 0) + 1
+
+    def printResults(self):
+        print("%d gismu" % (self.gismuCount))
+        for k,v in self.definitionLang.items():
+            print("%d defs in language %s" % (v, k))
+
+class ConflictingPairValidator(visitors.CollectionVisitorPair, Validator):
+    """Reports pairs of gismu where one is a similar form of the other."""
+
+    def __init__(self):
+        name = "Conflicting Gismu Validator"
+        Validator.__init__(self)
+        visitors.CollectionVisitorPair.__init__(self, name)
+
+    def visitGismuPair(self, a, b):
+        # CLL 4.14 - conflicting gismu: too similar
+        for aSimilar, aIdx, aLetter, rLetter in a.getSimilarForms():
+            if b.get() == aSimilar:
+                msg = "gismu too similar (differs by %s to %s)" % (aLetter, rLetter)
+                self.addValidationError(msg, a, b)
+                return
+        for bSimilar, bIdx, bLetter, rLetter in b.getSimilarForms():
+            if a.get() == bSimilar:
+                msg = "gismu too similar (differs by %s to %s)" % (bLetter, rLetter)
+                self.addValidationError(msg, a, b)
+                return
+
+class ConflictingRafsiValidator(visitors.CollectionVisitorPair, Validator):
+    """Reports pairs of gismu that have a rafsi in common."""
+
+    def __init__(self):
+        name = "Conflicting Rafsi Validator"
+        Validator.__init__(self)
+        visitors.CollectionVisitorPair.__init__(self, name)
+
+    def visitGismuPair(self, a, b):
+        # check for gismu that have the same rafsi
+        intersect = set(a.getRafsi()).intersection(b.getRafsi())
+        if intersect:
+            msg = "gismu have same rafsi (common rafsi: %s)" % (str(intersect))
+            self.addValidationError(msg, a, b)
+
+
+class FinalVowelValidator(visitors.CollectionVisitorAdjacent, Validator):
+    """Reports neighbouring gismu that share their first four letters."""
+
+    def __init__(self):
+        name = "Final Vowel Validator"
+        Validator.__init__(self)
+        visitors.CollectionVisitorAdjacent.__init__(self, name)
+
+    def visitGismuAdjacent(self, a, b):
+        # CLL 4.4 - no two gismu differ only in the final vowel (exception: broda, brode, brodi, brodo, and brodu)
+        stem = a.get()[0:4]
+        if stem != b.get()[0:4] or stem == "brod":
+            return
+        # The message takes no arguments: the gismu themselves are printed
+        # by GismuValidationError, and '%'-formatting a string without
+        # placeholders raises TypeError.
+        msg = "gismu only differ in final vowel"
+        self.addValidationError(msg, a, b)
+
+class DuplicateValidator(visitors.CollectionVisitorDuplicates, Validator):
+    """Reports gismu that occur more than once."""
+
+    def __init__(self):
+        name = "Duplicate Validator"
+        Validator.__init__(self)
+        visitors.CollectionVisitorDuplicates.__init__(self, name)
+
+    def visitGismuDuplicates(self, dupList):
+        # CLL 4.14 - conflicting gismu: identical
+        msg = "gismu duplicate form"
+        self.addValidationError(msg, *dupList)
+
+class LevenshteinPairMetric(visitors.CollectionVisitorPair, Metric):
+    """Lists the pairs of gismu with the smallest Levenshtein distances."""
+
+    def __init__(self):
+        name = "Smallest Levenshtein Distances Metric"
+        Metric.__init__(self, name)
+        visitors.CollectionVisitorPair.__init__(self, name)
+        self.top = []
+
+    @staticmethod
+    def levenshtein1(seq1, seq2):
+        # Implementation from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
+        # Ported to python 3: range() is no longer a list and xrange() is gone.
+        oneago = None
+        thisrow = list(range(1, len(seq2) + 1)) + [0]
+        for x in range(len(seq1)):
+            twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
+            for y in range(len(seq2)):
+                delcost = oneago[y] + 1
+                addcost = thisrow[y - 1] + 1
+                subcost = oneago[y - 1] + (seq1[x] != seq2[y])
+                thisrow[y] = min(delcost, addcost, subcost)
+        return thisrow[len(seq2) - 1]
+
+    @staticmethod
+    def levenshtein2(seq1, seq2):
+        # Implementation from http://rosettacode.org/wiki/Levenshtein_distance#Python
+        from functools import lru_cache
+        @lru_cache(maxsize=4095)
+        def ld(s, t):
+            if not s: return len(t)
+            if not t: return len(s)
+            if s[0] == t[0]: return ld(s[1:], t[1:])
+            l1 = ld(s, t[1:])
+            l2 = ld(s[1:], t)
+            l3 = ld(s[1:], t[1:])
+            return 1 + min(l1, l2, l3)
+        return ld(seq1, seq2)
+
+    @staticmethod
+    def levenshtein(seq1, seq2):
+        return LevenshteinPairMetric.levenshtein2(seq1, seq2)
+
+    def visitGismuPair(self, a, b):
+        import heapq
+        ldist = LevenshteinPairMetric.levenshtein(a.get(), b.get())
+        heapq.heappush(self.top, (ldist, (a, b)))
+
+    def printResults(self):
+        import heapq
+        # Print all results < 3. This empties the heap down to the first
+        # pair that is not printed.
+        while self.top:
+            (s, (a,b)) = heapq.heappop(self.top)
+            if s >= 3:
+                break
+            print("%d\t%s to %s" % (s, str(a), str(b)))
+
+
+def LoadGismu(gismu_dirs):
+    """Load every gismu file under the given directories into GISMU.
+
+    Returns the number of files that failed to load. Any exception other
+    than a parse or validation failure is unexpected and propagates.
+    """
+    countFail = 0
+    countProcessed = 0
+    for x in getFileList(gismu_dirs, skipRegex=r'.*\.md$'):
+        countProcessed += 1
+        try:
+            load_gismu_file(x)
+        except (gismu.YamlParseException, gismu.GismuValidationException) as exc:
+            countFail += 1
+            print("%s: %s" % (x, exc))
+        if countProcessed % 100 == 0:
+            print("...loaded %d" % (countProcessed))
+
+    print("...loaded %d" % (countProcessed))
+    print("%d failed to load" % (countFail))
+
+    return countFail
+
+def main():
+    # Load Gismu
+    gismu_dirs = getGismuDirs()
+    countFail = LoadGismu(gismu_dirs)
+
+    # Create visitors
+    visitMan = visitors.CollectionVisitorManager()
+    visitMan.addVisitor(ConflictingPairValidator())
+    visitMan.addVisitor(ConflictingRafsiValidator())
+    visitMan.addVisitor(FinalVowelValidator())
+    visitMan.addVisitor(LanguageMetrics())
+    #visitMan.addVisitor(LevenshteinPairMetric())
+
+    # Run visitors
+    visitMan.visit(GISMU)
+
+    # Output Results
+    validators = visitMan.getValidatorVisitors()
+    hasInvalid = not all(v.isValid() for v in validators)
+    print("==== Validation: ====")
+    for v in validators:
+        v.print()
+    if hasInvalid:
+        print("FAIL")
+    else:
+        print("PASS")
+
+    metrics = visitMan.getMetricVisitors()
+    print("==== Summary: ====")
+    for m in metrics:
+        m.print()
+
+    # Exit with error code
+    exitcode = 0
+    if countFail:
+        exitcode = exitcode | 1
+    if hasInvalid:
+        exitcode = exitcode | 2
+    import sys
+    sys.exit(exitcode)
+
+if __name__ == "__main__":
+    main()