diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..fd4f2b06 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +node_modules +.DS_Store diff --git a/README.md b/README.md index 84cd00c2..e66d29ea 100755 --- a/README.md +++ b/README.md @@ -3,6 +3,9 @@ Typo.js is a JavaScript spellchecker that uses Hunspell-style dictionaries. Usage ===== +Simple Loading +-------------- + To use Typo in a Chrome extension, simply include the typo.js file in your extension's background page, and then initialize the dictionary like so: ```javascript @@ -22,6 +25,33 @@ var Typo = require("typo-js"); var dictionary = new Typo([...]); ``` + +Faster Loading +-------------- + +If you care about memory or CPU usage, you should try this method. + +The above methods load the dictionary from Hunspell-compatible `.dic` and `.aff` files. But if you are using Node.js or a bundler that supports `require(...)`, you can instead load dictionaries from fast, memory-efficient, zero-copy-ish files that are precomputed using a script. + +To load en_US with the included precomputed dictionary files: + +```javascript +var Typo = require("typo-js"); +var dictionary = new Typo(); +dictionary.loadPrecomputed([...]); // Supports most of the same settings as the constructor +``` + +Assuming you installed this as a Node module, if you have some other set of `.aff` and `.dic` files, precompute the `.sst` and `.json` files used by the above technique by running: + +`./node_modules/.bin/typo-precompute [en_US|other_code] [path/to/dictionaries]` from a terminal in your project's root folder. + +NOTE: The precompute script will require a lot of memory if processing a large dictionary.
+ + + +Methods +------- + To check if a word is spelled correctly, do this: ```javascript @@ -36,6 +66,9 @@ var array_of_suggestions = dictionary.suggest("mispeling"); // array_of_suggestions == ["misspelling", "dispelling", "misdealing", "misfiling", "misruling"] ``` +Compatibility +------------- + Typo.js has full support for the following Hunspell affix flags: * PFX diff --git a/bin/benchmark.js b/bin/benchmark.js new file mode 100644 index 00000000..aa31b1ea --- /dev/null +++ b/bin/benchmark.js @@ -0,0 +1,39 @@ +#!/usr/bin/env node + +'use strict'; + + +const Typo = require('../src/typo'); + +function time(name, f, iters) { + iters = iters || 1; + + let t = new Date(); + for(var i = 0; i < iters; i++) { + f(); + } + let te = new Date(); + + let elapsed = ((te - t) / 1000) + 's'; + console.log(name, elapsed) +} + +console.log('Dictionary load time'); + +var dict = new Typo(); +time('- regular', () => dict.load('en_US')); + + +var preDict = new Typo(); +time('- precomputed', () => preDict.loadPrecomputed('en_US')); + + +console.log('\n\ndict.check() speed'); +var words = ['hypersensitiveness', "Abbott's", '9th', 'aaraara', "didn't", "he're"]; + +var n = 1000000; +words.map((w) => { + time('- ' + w + ' (reg)', () => dict.check(w), n); + time('- ' + w + ' (pre)', () => preDict.check(w), n); +}) + diff --git a/bin/precompute-dic.js b/bin/precompute-dic.js new file mode 100755 index 00000000..a34c562f --- /dev/null +++ b/bin/precompute-dic.js @@ -0,0 +1,91 @@ +#!/usr/bin/env node +/* + The hunspell '.dic' dictionaries that typo.js take too long to load for most web applications and the default javascript object based word table is very inefficient on memory. 
+ + This is a node.js script for taking an existing dictionary, loading it the regular way and then outputting precomputed `.json` and `.sst` files. + + Usage: + - call as ./bin/precompute-dic.js [locale_code] [path_to_dictionaries_folder] + + - default usage is equivalent to "./bin/precompute-dic.js en_US ./src/dictionaries" +*/ + +'use strict'; + +const fs = require('fs'); +const path = require('path'); +const sstab = require('sstab'); + +var DICT = process.argv[2] || 'en_US'; +var FOLDER = path.resolve(process.argv[3] || (__dirname + '/../src/dictionaries/')); +var BINSIZE = 12; /**< Number of strings per sst bin */ + +console.log(); + + +var Typo = require('../src/typo'); +var dict = new Typo(); +dict.load(DICT, null, null, { dictionaryPath: FOLDER }); // read the .aff/.dic from the same folder the output is written to + +console.log('# Expanded Words:', Object.keys(dict.dictionaryTable).length) +console.log('Flags:', dict.flags) + +var dt = dict.dictionaryTable; + + +var obj = {}; + +console.log('1/3 Compressing values'); +for(var k in dt) { + if(dt.hasOwnProperty(k)) { + var v = dt[k]? dt[k] : []; + + // Make sure that the charset is loading correctly + if(k.slice(2) == 'rich' && k[0] == 'Z') { + console.log(k.charCodeAt(1), '==', 'Zürich'.charCodeAt(1)); // Should be 'Zürich' + } + + // Remove rules that have already been applied + for(var i = 0; i < v.length; i++) { + var r = dict.rules[v[i]]; + if(r && (r.type === 'PFX' || r.type === 'SFX')) { + v.splice(i, 1); + i--; + } + } + + v = v.join(''); + + if(typeof(v) !== 'string') { + console.log('Unsupported flags:', typeof(v)); + } + + obj[k] = v; + } +} + + +console.log('2/3 Creating table'); + +var buf = sstab.build(obj); + + +console.log('3/3 Saving'); + +// Generate metadata file +fs.writeFileSync(FOLDER + `/${DICT}/${DICT}.json`, JSON.stringify({ + compoundRuleCodes: dict.compoundRuleCodes, + dictionary: dict.dictionary, + rules: dict.rules, + compoundRules: dict.compoundRules.map((r) => r.toString()), // Regex needs to be explicitly stringified for JSON serialization +
+ replacementTable: dict.replacementTable, + flags: dict.flags, + loaded: dict.loaded +})) + +fs.writeFileSync(FOLDER + `/${DICT}/${DICT}.sst`, buf); + +console.log('Done!'); + + diff --git a/examples/node/precomputed.js b/examples/node/precomputed.js new file mode 100644 index 00000000..d24becab --- /dev/null +++ b/examples/node/precomputed.js @@ -0,0 +1,20 @@ +/** + * Before running, ensure that you have done + * $ npm install typo-js + */ + +var Typo = require("typo-js"); +var dictionary = new Typo(); +dictionary.loadPrecomputed("en_US"); + +var is_spelled_correctly = dictionary.check("mispelled"); + +console.log( "Is 'mispelled' spelled correctly? " + is_spelled_correctly ); + +var is_spelled_correctly = dictionary.check("misspelled"); + +console.log( "Is 'misspelled' spelled correctly? " + is_spelled_correctly ); + +var array_of_suggestions = dictionary.suggest("mispeling"); + +console.log( "Spelling suggestions for 'mispeling': " + array_of_suggestions.join( ', ' ) ); \ No newline at end of file diff --git a/index.js b/index.js new file mode 100644 index 00000000..37ac14ea --- /dev/null +++ b/index.js @@ -0,0 +1,2 @@ + +module.exports = require('./src/typo'); \ No newline at end of file diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 00000000..08c6e821 --- /dev/null +++ b/package-lock.json @@ -0,0 +1,208 @@ +{ + "name": "typo-js", + "version": "1.0.5", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "balanced-match": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", + "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "dev": true + }, + "brace-expansion": { + "version": "1.1.11", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", + "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "dev": true, + "requires": { + "balanced-match": "1.0.0", + 
"concat-map": "0.0.1" + } + }, + "browser-stdout": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/browser-stdout/-/browser-stdout-1.3.1.tgz", + "integrity": "sha512-qhAVI1+Av2X7qelOfAIYwXONood6XlZE/fXaBSmW/T5SzLAmCgzi+eiWE7fUvbHaeNBQH13UftjpXxsfLkMpgw==", + "dev": true + }, + "commander": { + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.11.0.tgz", + "integrity": "sha512-b0553uYA5YAEGgyYIGYROzKQ7X5RAqedkfjiZxwi0kL1g3bOaBNNZfYkzt/CL0umgD5wc9Jec2FbB98CjkMRvQ==", + "dev": true + }, + "concat-map": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", + "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "dev": true + }, + "debug": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/debug/-/debug-3.1.0.tgz", + "integrity": "sha512-OX8XqP7/1a9cqkxYw2yXss15f26NKWBpDXQd0/uK/KPqdQhxbPa994hnzjcE2VqQpDslf55723cKPUOGSmMY3g==", + "dev": true, + "requires": { + "ms": "2.0.0" + } + }, + "diff": { + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/diff/-/diff-3.5.0.tgz", + "integrity": "sha512-A46qtFgd+g7pDZinpnwiRJtxbC1hpgf0uzP3iG89scHk0AUC7A1TGxf5OiiOUv/JMZR8GOt8hL900hV0bOy5xA==", + "dev": true + }, + "escape-string-regexp": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-1.0.5.tgz", + "integrity": "sha1-G2HAViGQqN/2rjuyzwIAyhMLhtQ=", + "dev": true + }, + "fs.realpath": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", + "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "dev": true + }, + "glob": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", + "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "dev": true, + "requires": { + "fs.realpath": "1.0.0", + "inflight": "1.0.6", + "inherits": "2.0.3", + "minimatch": "3.0.4", + 
"once": "1.4.0", + "path-is-absolute": "1.0.1" + } + }, + "growl": { + "version": "1.10.3", + "resolved": "https://registry.npmjs.org/growl/-/growl-1.10.3.tgz", + "integrity": "sha512-hKlsbA5Vu3xsh1Cg3J7jSmX/WaW6A5oBeqzM88oNbCRQFz+zUaXm6yxS4RVytp1scBoJzSYl4YAEOQIt6O8V1Q==", + "dev": true + }, + "has-flag": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-2.0.0.tgz", + "integrity": "sha1-6CB68cx7MNRGzHC3NLXovhj4jVE=", + "dev": true + }, + "he": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/he/-/he-1.1.1.tgz", + "integrity": "sha1-k0EP0hsAlzUVH4howvJx80J+I/0=", + "dev": true + }, + "inflight": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", + "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "dev": true, + "requires": { + "once": "1.4.0", + "wrappy": "1.0.2" + } + }, + "inherits": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", + "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", + "dev": true + }, + "minimatch": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", + "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "dev": true, + "requires": { + "brace-expansion": "1.1.11" + } + }, + "minimist": { + "version": "0.0.8", + "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", + "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", + "dev": true + }, + "mkdirp": { + "version": "0.5.1", + "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", + "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "dev": true, + "requires": { + "minimist": "0.0.8" + } + }, + "mocha": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/mocha/-/mocha-5.0.5.tgz", + "integrity": "sha512-3MM3UjZ5p8EJrYpG7s+29HAI9G7sTzKEe4+w37Dg0QP7qL4XGsV+Q2xet2cE37AqdgN1OtYQB6Vl98YiPV3PgA==", + "dev": 
true, + "requires": { + "browser-stdout": "1.3.1", + "commander": "2.11.0", + "debug": "3.1.0", + "diff": "3.5.0", + "escape-string-regexp": "1.0.5", + "glob": "7.1.2", + "growl": "1.10.3", + "he": "1.1.1", + "mkdirp": "0.5.1", + "supports-color": "4.4.0" + } + }, + "ms": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", + "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "dev": true + }, + "once": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", + "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "dev": true, + "requires": { + "wrappy": "1.0.2" + } + }, + "path-is-absolute": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", + "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "dev": true + }, + "sstab": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/sstab/-/sstab-1.0.2.tgz", + "integrity": "sha512-tWuKsYqPmU0ACX8YshwgCTItXB1fhhndlMge0/PQAYi8mUV96dTLQ9x9lrIqAR9A6T+LeVyeK/qareFQvu6zjw==", + "requires": { + "utf8": "3.0.0" + } + }, + "supports-color": { + "version": "4.4.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-4.4.0.tgz", + "integrity": "sha512-rKC3+DyXWgK0ZLKwmRsrkyHVZAjNkfzeehuFWdGGcqGDTZFH73+RH6S/RDAAxl9GusSjZSUWYLmT9N5pzXFOXQ==", + "dev": true, + "requires": { + "has-flag": "2.0.0" + } + }, + "utf8": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/utf8/-/utf8-3.0.0.tgz", + "integrity": "sha512-E8VjFIQ/TyQgp+TZfS6l8yp/xWppSAHzidGiRrqe4bK4XP9pTRyKFgGJpO3SN7zdX4DeomTrwaseCHovfpFcqQ==" + }, + "wrappy": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", + "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", + "dev": true + } + } +} diff --git a/typo/package.json b/package.json similarity index 75% rename from typo/package.json rename to package.json index 63dc7a42..ff7448a3 100644 --- a/typo/package.json +++ 
b/package.json @@ -1,12 +1,18 @@ { "name": "typo-js", - "version": "1.0.3", + "version": "1.0.5", "description": "A Hunspell-style spellchecker.", - "main": "typo.js", + "main": "index.js", "repository": { "type": "git", "url": "git://github.com/cfinke/Typo.js.git" }, + "bin": { + "typo-precompute": "./bin/precompute-dic.js" + }, + "scripts": { + "test": "mocha" + }, "keywords": [ "spellcheck", "spellchecker", @@ -23,5 +29,11 @@ "tonicExample": "var Typo = require('typo-js'); var dictionary = new Typo('en_US'); dictionary.check('mispelled');", "browser": { "fs": false + }, + "dependencies": { + "sstab": "^1.0.2" + }, + "devDependencies": { + "mocha": "^5.0.5" } } diff --git a/typo/README.md b/src/README.md similarity index 100% rename from typo/README.md rename to src/README.md diff --git a/typo/dictionaries/en_US/README.md b/src/dictionaries/en_US/README.md similarity index 100% rename from typo/dictionaries/en_US/README.md rename to src/dictionaries/en_US/README.md diff --git a/typo/dictionaries/en_US/en_US.aff b/src/dictionaries/en_US/en_US.aff similarity index 100% rename from typo/dictionaries/en_US/en_US.aff rename to src/dictionaries/en_US/en_US.aff diff --git a/typo/dictionaries/en_US/en_US.dic b/src/dictionaries/en_US/en_US.dic similarity index 100% rename from typo/dictionaries/en_US/en_US.dic rename to src/dictionaries/en_US/en_US.dic diff --git a/src/dictionaries/en_US/en_US.json b/src/dictionaries/en_US/en_US.json new file mode 100644 index 00000000..04f83cf8 --- /dev/null +++ b/src/dictionaries/en_US/en_US.json @@ -0,0 +1 @@
+{"compoundRuleCodes":{"1":["1"],"n":["0","1","2","3","4","5","6","7","8","9"],"t":["0th","1th","2th","3th","4th","5th","6th","7th","8th","9th"],"m":["0","2","3","4","5","6","7","8","9"],"p":["0th","1st","2nd","3rd","4th","5th","6th","7th","8th","9th"],"c":["1th","2th","3th"]},"dictionary":"en_US","rules":{"A":{"type":"PFX","combineable":true,"entries":[{"add":"re"}]},"I":{"type":"PFX","combineable":true,"entries":[{"add":"in"}]},"U":{"type":"PFX","combineable":true,"entries":[{"add":"un"}]},"C":{"type":"PFX","combineable":true,"entries":[{"add":"de"}]},"E":{"type":"PFX","combineable":true,"entries":[{"add":"dis"}]},"F":{"type":"PFX","combineable":true,"entries":[{"add":"con"}]},"K":{"type":"PFX","combineable":true,"entries":[{"add":"pro"}]},"V":{"type":"SFX","combineable":false,"entries":[{"add":"ive","match":{},"remove":{}},{"add":"ive","match":{}}]},"N":{"type":"SFX","combineable":true,"entries":[{"add":"ion","match":{},"remove":{}},{"add":"ication","match":{},"remove":{}},{"add":"en","match":{}}]},"X":{"type":"SFX","combineable":true,"entries":[{"add":"ions","match":{},"remove":{}},{"add":"ications","match":{},"remove":{}},{"add":"ens","match":{}}]},"H":{"type":"SFX","combineable":false,"entries":[{"add":"ieth","match":{},"remove":{}},{"add":"th","match":{}}]},"Y":{"type":"SFX","combineable":true,"entries":[{"add":"ly"}]},"G":{"type":"SFX","combineable":true,"entries":[{"add":"ing","match":{},"remove":{}},{"add":"ing","match":{}}]},"J":{"type":"SFX","combineable":true,"entries":[{"add":"ings","match":{},"remove":{}},{"add":"ings","match":{}}]},"D":{"type":"SFX","combineable":true,"entries":[{"add":"d","match":{}},{"add":"ied","match":{},"remove":{}},{"add":"ed","match":{}},{"add":"ed","match":{}}]},"T":{"type":"SFX","combineable":false,"entries":[{"add":"st","match":{}},{"add":"iest","match":{},"remove":{}},{"add":"est","match":{}},{"add":"est","match":{}}]},"R":{"type":"SFX","combineable":true,"entries":[{"add":"r","match":{}},{"add":"ier","match":{},"remove":{
}},{"add":"er","match":{}},{"add":"er","match":{}}]},"Z":{"type":"SFX","combineable":true,"entries":[{"add":"rs","match":{}},{"add":"iers","match":{},"remove":{}},{"add":"ers","match":{}},{"add":"ers","match":{}}]},"S":{"type":"SFX","combineable":true,"entries":[{"add":"ies","match":{},"remove":{}},{"add":"s","match":{}},{"add":"es","match":{}},{"add":"s","match":{}}]},"P":{"type":"SFX","combineable":true,"entries":[{"add":"iness","match":{},"remove":{}},{"add":"ness","match":{}},{"add":"ness","match":{}}]},"M":{"type":"SFX","combineable":true,"entries":[{"add":"'s"}]},"B":{"type":"SFX","combineable":true,"entries":[{"add":"able","match":{}},{"add":"able","match":{}},{"add":"able","match":{},"remove":{}}]},"L":{"type":"SFX","combineable":true,"entries":[{"add":"ment"}]}},"compoundRules":["/(0|1|2|3|4|5|6|7|8|9)*(1)(0th|1th|2th|3th|4th|5th|6th|7th|8th|9th)/i","/(0|1|2|3|4|5|6|7|8|9)*(0|2|3|4|5|6|7|8|9)(0th|1st|2nd|3rd|4th|5th|6th|7th|8th|9th)/i"],"replacementTable":[["a","ei"],["ei","a"],["a","ey"],["ey","a"],["ai","ie"],["ie","ai"],["are","air"],["are","ear"],["are","eir"],["air","are"],["air","ere"],["ere","air"],["ere","ear"],["ere","eir"],["ear","are"],["ear","air"],["ear","ere"],["eir","are"],["eir","ere"],["ch","te"],["te","ch"],["ch","ti"],["ti","ch"],["ch","tu"],["tu","ch"],["ch","s"],["s","ch"],["ch","k"],["k","ch"],["f","ph"],["ph","f"],["gh","f"],["f","gh"],["i","igh"],["igh","i"],["i","uy"],["uy","i"],["i","ee"],["ee","i"],["j","di"],["di","j"],["j","gg"],["gg","j"],["j","ge"],["ge","j"],["s","ti"],["ti","s"],["s","ci"],["ci","s"],["k","cc"],["cc","k"],["k","qu"],["qu","k"],["kw","qu"],["o","eau"],["eau","o"],["o","ew"],["ew","o"],["oo","ew"],["ew","oo"],["ew","ui"],["ui","ew"],["oo","ui"],["ui","oo"],["ew","u"],["u","ew"],["oo","u"],["u","oo"],["u","oe"],["oe","u"],["u","ieu"],["ieu","u"],["ue","ew"],["ew","ue"],["uff","ough"],["oo","ieu"],["ieu","oo"],["ier","ear"],["ear","ier"],["ear","air"],["air","ear"],["w","qu"],["qu","w"],["z","ss"],["ss","z"],["s
hun","tion"],["shun","sion"],["shun","cion"]],"flags":{"SET":"ISO8859-1","TRY":"esianrtolcdugmphbyfvkwzESIANRTOLCDUGMPHBYFVKWZ'","NOSUGGEST":"!","COMPOUNDMIN":"1","ONLYINCOMPOUND":"c","WORDCHARS":"0123456789"},"loaded":true} \ No newline at end of file diff --git a/src/dictionaries/en_US/en_US.sst b/src/dictionaries/en_US/en_US.sst new file mode 100644 index 00000000..a2ac27a1 Binary files /dev/null and b/src/dictionaries/en_US/en_US.sst differ diff --git a/typo/typo.js b/src/typo.js similarity index 67% rename from typo/typo.js rename to src/typo.js index d66353c5..3c13226e 100644 --- a/typo/typo.js +++ b/src/typo.js @@ -5,7 +5,7 @@ /* globals module: false */ /** - * Typo is a JavaScript implementation of a spellchecker using hunspell-style + * Typo is a JavaScript implementation of a spellchecker using hunspell-style * dictionaries. */ @@ -17,81 +17,275 @@ var Typo; /** * Typo constructor. * - * @param {String} [dictionary] The locale code of the dictionary being used. e.g., - * "en_US". This is only used to auto-load dictionaries. - * @param {String} [affData] The data from the dictionary's .aff file. If omitted - * and Typo.js is being used in a Chrome extension, the .aff - * file will be loaded automatically from - * lib/typo/dictionaries/[dictionary]/[dictionary].aff - * In other environments, it will be loaded from - * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].aff - * @param {String} [wordsData] The data from the dictionary's .dic file. If omitted - * and Typo.js is being used in a Chrome extension, the .dic - * file will be loaded automatically from - * lib/typo/dictionaries/[dictionary]/[dictionary].dic - * In other environments, it will be loaded from - * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].dic - * @param {Object} [settings] Constructor settings. Available properties are: - * {String} [dictionaryPath]: path to load dictionary from in non-chrome - * environment. - * {Object} [flags]: flag information. 
- * {Boolean} [asyncLoad]: If true, affData and wordsData will be loaded - * asynchronously. - * {Function} [loadedCallback]: Called when both affData and wordsData - * have been loaded. Only used if asyncLoad is set to true. The parameter - * is the instantiated Typo object. - * * @returns {Typo} A Typo object. */ -Typo = function (dictionary, affData, wordsData, settings) { - settings = settings || {}; +Typo = function () { this.dictionary = null; - + this.rules = {}; this.dictionaryTable = {}; - + this.compoundRules = []; this.compoundRuleCodes = {}; - + this.replacementTable = []; - - this.flags = settings.flags || {}; - + + this.flags = {}; + this.memoized = {}; - this.loaded = false; - - var self = this; - - var path; - - // Loop-control variables. - var i, j, _len, _jlen; - - if (dictionary) { - self.dictionary = dictionary; + if(arguments.length > 0) { + this.load.apply(this, arguments); + } + + + return this; +}; + +Typo.prototype = { + + /** + * Loads a Typo instance from a hash of all of the Typo properties. + * + * @param object obj A hash of Typo properties, probably gotten from a JSON.parse(JSON.stringify(typo_instance)). + */ + + loadRaw : function (obj) { + for (var i in obj) { + if (obj.hasOwnProperty(i)) { + + if(i === 'compoundRules') { + obj[i] = obj[i].map(function(r) { + var m = r.match(/\/(.*)\/(.*)?/); + return new RegExp(m[1], m[2] || ''); + }); + } + + this[i] = obj[i]; + } + } + + return this; + }, + + /** + * Similar to load() except loads the precomputed .json and .sst files + * + * NOTE: This is currently only supported in environments with require() + * NOTE: In order to create these files see the 'bin/precompute-dic.js' script with usage in the README. + * NOTE: sstData if given is expected as + * + * @param {String} dictionary The locale code of the dictionary being used. e.g., + * "en_US". This is only used to auto-load dictionaries. + * @param {String} [jsonData] The data from the dictionary's .json file. 
If omitted + * and Typo.js is being used in a Chrome extension, the .json + * file will be loaded automatically from + * lib/typo/dictionaries/[dictionary]/[dictionary].json + * In other environments, it will be loaded from + * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].json + * @param {ArrayBuffer} [sstData] The data from the dictionary's .sst file. If omitted + * and Typo.js is being used in a Chrome extension, the .sst + * file will be loaded automatically from + * lib/typo/dictionaries/[dictionary]/[dictionary].sst + * In other environments, it will be loaded from + * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].sst + * @param {Object} [settings] Loader settings. Available properties are: + * {String} [dictionaryPath]: path to load dictionary from in non-chrome + * environment. + * {Boolean} [asyncLoad]: If true, jsonData and sstData will be loaded + * asynchronously. + * {Function} [loadedCallback]: Called when both jsonData and sstData + * have been loaded. Only used if asyncLoad is set to true. The parameter + * is the instantiated Typo object.
+ */ + loadPrecomputed : function (dictionary, jsonData, sstData, settings) { + + var SSTable = require('sstab/dist/sstable'); + + + settings = settings || {}; + + + if(!jsonData) { + jsonData = this._readFile(this._resolveFilePath(dictionary, 'json', settings.dictionaryPath), 'utf8', settings.asyncLoad); + } + else if(settings.asyncLoad) { + jsonData = Promise.resolve(jsonData); + } + + if(!sstData) { + sstData = this._readFile(this._resolveFilePath(dictionary, 'sst', settings.dictionaryPath), null, settings.asyncLoad, true); + } + else if(settings.asyncLoad) { + sstData = Promise.resolve(sstData); + } + + + var self = this; + + + if(settings.asyncLoad) { + Promise.all([jsonData, sstData]).then(function(values) { + jsonData = values[0]; + sstData = values[1]; + finishLoading(); + }); + } + else { + finishLoading(); + } + + + + function finishLoading() { + + var props = JSON.parse(jsonData); + + self.loadRaw(props); + + var table = new SSTable(sstData); + + self.dictionaryTable = table; + self._getDictionaryEntry = function(k) { + + var s = self.dictionaryTable.get(k); + if(s === undefined) { + return s; + } + else if (s == '') { + return null; + } + + return [s.split('')]; + } + + + if(settings.asyncLoad && settings.loadedCallback) { + settings.loadedCallback(self); + } + + } + + }, + + + /** + * Loads the library from remote files + * + * NOTE: If a character set is given on the .aff file and it is not ISO8859-1, then it must be manually given as a setting. This library currently does not support automatic parsing of that setting. + * + * @param {String} dictionary The locale code of the dictionary being used. e.g., + * "en_US". This is only used to auto-load dictionaries. + * @param {String} [affData] The data from the dictionary's .aff file. 
If omitted + * and Typo.js is being used in a Chrome extension, the .aff + * file will be loaded automatically from + * lib/typo/dictionaries/[dictionary]/[dictionary].aff + * In other environments, it will be loaded from + * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].aff + * @param {String} [wordsData] The data from the dictionary's .dic file. If omitted + * and Typo.js is being used in a Chrome extension, the .dic + * file will be loaded automatically from + * lib/typo/dictionaries/[dictionary]/[dictionary].dic + * In other environments, it will be loaded from + * [settings.dictionaryPath]/dictionaries/[dictionary]/[dictionary].dic + * @param {Object} [settings] Loader settings. Available properties are: + * {String} [dictionaryPath]: path to load dictionary from in non-chrome + * environment. + * {Object} [flags]: flag information. + * {Boolean} [asyncLoad]: If true, affData and wordsData will be loaded + * asynchronously. + * {Function} [loadedCallback]: Called when both affData and wordsData + * have been loaded. Only used if asyncLoad is set to true. The parameter + * is the instantiated Typo object. + * {String} [charset]: The character set specified on the first line of the + * .aff file if specified. + */ + load : function(dictionary, affData, wordsData, settings) { + + settings = settings || {}; + + this.flags = settings.flags || {}; + + + var self = this; + + var path; + + // Loop-control variables. + var i, j, _len, _jlen; + + + this.dictionary = dictionary; + // If the data is preloaded, just setup the Typo object. 
if (affData && wordsData) { setup(); } + else { + if (!affData) readDataFile(this._resolveFilePath(dictionary, 'aff', settings.dictionaryPath), setAffData); + if (!wordsData) readDataFile(this._resolveFilePath(dictionary, 'dic', settings.dictionaryPath), setWordsData); + } + + function readDataFile(url, setFunc) { + var response = self._readFile(url, settings.charset, settings.asyncLoad); + + if (settings.asyncLoad) { + response.then(function(data) { + setFunc(data); + }); + } + else { + setFunc(response); + } + } + + function setAffData(data) { + affData = data; + + if (wordsData) { + setup(); + } + } + + function setWordsData(data) { + wordsData = data; + + if (affData) { + setup(); + } + } + + function setup() { + self._loadAff(affData); + + self.dictionaryTable = self._parseDIC(wordsData); + + self._loadFinish(settings); + } + + }, + + _resolveFilePath : function(dictionary, extension, dictionaryPath) { + + var path; + // Loading data for Chrome extentions. - else if (typeof window !== 'undefined' && 'chrome' in window && 'extension' in window.chrome && 'getURL' in window.chrome.extension) { - if (settings.dictionaryPath) { - path = settings.dictionaryPath; + if (typeof window !== 'undefined' && 'chrome' in window && 'extension' in window.chrome && 'getURL' in window.chrome.extension) { + if (dictionaryPath) { + path = dictionaryPath; } else { - path = "typo/dictionaries"; + path = "src/dictionaries"; } - - if (!affData) readDataFile(chrome.extension.getURL(path + "/" + dictionary + "/" + dictionary + ".aff"), setAffData); - if (!wordsData) readDataFile(chrome.extension.getURL(path + "/" + dictionary + "/" + dictionary + ".dic"), setWordsData); + + return chrome.extension.getURL(path + "/" + dictionary + "/" + dictionary + "." 
+ extension); } else { - if (settings.dictionaryPath) { - path = settings.dictionaryPath; + if (dictionaryPath) { + path = dictionaryPath; } else if (typeof __dirname !== 'undefined') { path = __dirname + '/dictionaries'; @@ -99,123 +293,88 @@ Typo = function (dictionary, affData, wordsData, settings) { else { path = './dictionaries'; } - - if (!affData) readDataFile(path + "/" + dictionary + "/" + dictionary + ".aff", setAffData); - if (!wordsData) readDataFile(path + "/" + dictionary + "/" + dictionary + ".dic", setWordsData); - } - } - - function readDataFile(url, setFunc) { - var response = self._readFile(url, null, settings.asyncLoad); - - if (settings.asyncLoad) { - response.then(function(data) { - setFunc(data); - }); - } - else { - setFunc(response); - } - } - - function setAffData(data) { - affData = data; - if (wordsData) { - setup(); + return path + "/" + dictionary + "/" + dictionary + "." + extension; } - } + }, - function setWordsData(data) { - wordsData = data; + // Loads the AFF From a string containing it + _loadAff : function(data) { + var i, j, _len, _jlen; - if (affData) { - setup(); - } - } + this.rules = this._parseAFF(data); - function setup() { - self.rules = self._parseAFF(affData); - // Save the rule codes that are used in compound rules. - self.compoundRuleCodes = {}; - - for (i = 0, _len = self.compoundRules.length; i < _len; i++) { - var rule = self.compoundRules[i]; - + this.compoundRuleCodes = {}; + + for (i = 0, _len = this.compoundRules.length; i < _len; i++) { + var rule = this.compoundRules[i]; + for (j = 0, _jlen = rule.length; j < _jlen; j++) { - self.compoundRuleCodes[rule[j]] = []; + this.compoundRuleCodes[rule[j]] = []; } } - + // If we add this ONLYINCOMPOUND flag to self.compoundRuleCodes, then _parseDIC // will do the work of saving the list of words that are compound-only. 
- if ("ONLYINCOMPOUND" in self.flags) { - self.compoundRuleCodes[self.flags.ONLYINCOMPOUND] = []; + if ("ONLYINCOMPOUND" in this.flags) { + this.compoundRuleCodes[this.flags.ONLYINCOMPOUND] = []; } - - self.dictionaryTable = self._parseDIC(wordsData); - - // Get rid of any codes from the compound rule codes that are never used - // (or that were special regex characters). Not especially necessary... - for (i in self.compoundRuleCodes) { - if (self.compoundRuleCodes[i].length === 0) { - delete self.compoundRuleCodes[i]; + + }, + + + // Perform any steps needing after both the DIC and the AFF are in + _loadFinish : function (settings) { + + // Loop-control variables. + var i, j, _len, _jlen; + + + // Get rid of any codes from the compound rule codes that are never used + // (or that were special regex characters). Not especially necessary... + for (i in this.compoundRuleCodes) { + if (this.compoundRuleCodes[i].length === 0) { + delete this.compoundRuleCodes[i]; } } - + // Build the full regular expressions for each compound rule. - // I have a feeling (but no confirmation yet) that this method of + // I have a feeling (but no confirmation yet) that this method of // testing for compound words is probably slow. 
- for (i = 0, _len = self.compoundRules.length; i < _len; i++) { - var ruleText = self.compoundRules[i]; - + for (i = 0, _len = this.compoundRules.length; i < _len; i++) { + var ruleText = this.compoundRules[i]; + var expressionText = ""; - + for (j = 0, _jlen = ruleText.length; j < _jlen; j++) { var character = ruleText[j]; - - if (character in self.compoundRuleCodes) { - expressionText += "(" + self.compoundRuleCodes[character].join("|") + ")"; + + if (character in this.compoundRuleCodes) { + expressionText += "(" + this.compoundRuleCodes[character].join("|") + ")"; } else { expressionText += character; } } - - self.compoundRules[i] = new RegExp(expressionText, "i"); + + this.compoundRules[i] = new RegExp(expressionText, "i"); } - - self.loaded = true; - + + this.loaded = true; + if (settings.asyncLoad && settings.loadedCallback) { - settings.loadedCallback(self); + settings.loadedCallback(this); } - } - - return this; -}; + }, -Typo.prototype = { - /** - * Loads a Typo instance from a hash of all of the Typo properties. - * - * @param object obj A hash of Typo properties, probably gotten from a JSON.parse(JSON.stringify(typo_instance)). - */ - - load : function (obj) { - for (var i in obj) { - if (obj.hasOwnProperty(i)) { - this[i] = obj[i]; - } - } - - return this; + _getDictionaryEntry : function(key) { + return this.dictionaryTable[key]; }, - + /** * Read the contents of a file. - * + * * @param {String} path The path (relative) to the file. * @param {String} [charset="ISO8859-1"] The expected charset of the file * @param {Boolean} async If true, the file will be read asynchronously. For node.js this does nothing, all @@ -223,54 +382,72 @@ Typo.prototype = { * @returns {String} The file data if async is false, otherwise a promise object. If running node.js, the data is * always returned. 
*/ - - _readFile : function (path, charset, async) { - charset = charset || "utf8"; - + + _readFile : function (path, charset, async, arrayBuffer) { + // NOTE: Node 6.4.0+ is required for the default character sets + charset = charset || "ISO8859-1"; + if (typeof XMLHttpRequest !== 'undefined') { var promise; var req = new XMLHttpRequest(); req.open("GET", path, async); - + if (async) { promise = new Promise(function(resolve, reject) { req.onload = function() { if (req.status === 200) { - resolve(req.responseText); + resolve(arrayBuffer? req.response : req.responseText); } else { reject(req.statusText); } }; - + req.onerror = function() { reject(req.statusText); } }); } - - if (req.overrideMimeType) + + if (arrayBuffer) { + req.responseType = "arraybuffer"; + } + else if (req.overrideMimeType) { req.overrideMimeType("text/plain; charset=" + charset); - + } + req.send(null); - - return async ? promise : req.responseText; + + return async ? promise : (buffer? req.response : req.responseText); } else if (typeof require !== 'undefined') { // Node.js var fs = require("fs"); + + // Some charsets go by another name for node buffer + if(charset.toUpperCase() === 'ISO8859-1') { + charset = 'latin1'; + } try { if (fs.existsSync(path)) { var stats = fs.statSync(path); - + var fileDescriptor = fs.openSync(path, 'r'); - + var buffer = new Buffer(stats.size); - + fs.readSync(fileDescriptor, buffer, 0, buffer.length, null); - - return buffer.toString(charset, 0, buffer.length); + + var val; + if(arrayBuffer) { + val = buffer.buffer; + } + else { + val = buffer.toString(charset, 0, buffer.length);; + } + + return async? Promise.resolve(val) : val; } else { console.log("Path " + path + " does not exist."); @@ -281,59 +458,59 @@ Typo.prototype = { } } }, - + /** * Parse the rules out from a .aff file. * * @param {String} data The contents of the affix file. * @returns object The rules from the file. 
*/ - + _parseAFF : function (data) { var rules = {}; - + var line, subline, numEntries, lineParts; var i, j, _len, _jlen; - + // Remove comment lines data = this._removeAffixComments(data); - + var lines = data.split("\n"); - + for (i = 0, _len = lines.length; i < _len; i++) { line = lines[i]; - + var definitionParts = line.split(/\s+/); - + var ruleType = definitionParts[0]; - + if (ruleType == "PFX" || ruleType == "SFX") { var ruleCode = definitionParts[1]; var combineable = definitionParts[2]; numEntries = parseInt(definitionParts[3], 10); - + var entries = []; - + for (j = i + 1, _jlen = i + 1 + numEntries; j < _jlen; j++) { subline = lines[j]; - + lineParts = subline.split(/\s+/); var charactersToRemove = lineParts[2]; - + var additionParts = lineParts[3].split("/"); - + var charactersToAdd = additionParts[0]; if (charactersToAdd === "0") charactersToAdd = ""; - + var continuationClasses = this.parseRuleCodes(additionParts[1]); - + var regexToMatch = lineParts[4]; - + var entry = {}; entry.add = charactersToAdd; - + if (continuationClasses.length > 0) entry.continuationClasses = continuationClasses; - + if (regexToMatch !== ".") { if (ruleType === "SFX") { entry.match = new RegExp(regexToMatch + "$"); @@ -342,7 +519,7 @@ Typo.prototype = { entry.match = new RegExp("^" + regexToMatch); } } - + if (charactersToRemove != "0") { if (ruleType === "SFX") { entry.remove = new RegExp(charactersToRemove + "$"); @@ -351,29 +528,29 @@ Typo.prototype = { entry.remove = charactersToRemove; } } - + entries.push(entry); } - + rules[ruleCode] = { "type" : ruleType, "combineable" : (combineable == "Y"), "entries" : entries }; - + i += numEntries; } else if (ruleType === "COMPOUNDRULE") { numEntries = parseInt(definitionParts[1], 10); - + for (j = i + 1, _jlen = i + 1 + numEntries; j < _jlen; j++) { line = lines[j]; - + lineParts = line.split(/\s+/); this.compoundRules.push(lineParts[1]); } - + i += numEntries; } else if (ruleType === "REP") { lineParts = line.split(/\s+/); - + 
if (lineParts.length === 3) { this.replacementTable.push([ lineParts[1], lineParts[2] ]); } @@ -384,21 +561,21 @@ Typo.prototype = { // FLAG // KEEPCASE // NEEDAFFIX - + this.flags[ruleType] = definitionParts[1]; } } - + return rules; }, - + /** * Removes comment lines and then cleans up blank lines and trailing whitespace. * * @param {String} data The data from an affix file. * @return {String} The cleaned-up data. */ - + _removeAffixComments : function (data) { // Remove comments // This used to remove any string starting with '#' up to the end of the line, @@ -406,19 +583,19 @@ Typo.prototype = { // I haven't seen any affix files that use comments on the same line as real data, // so I don't think this will break anything. data = data.replace(/^\s*#.*$/mg, ""); - + // Trim each line data = data.replace(/^\s\s*/m, '').replace(/\s\s*$/m, ''); - + // Remove blank lines. data = data.replace(/\n{2,}/g, "\n"); - + // Trim the entire string data = data.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); - + return data; }, - + /** * Parses the words out from the .dic file. * @@ -426,19 +603,19 @@ Typo.prototype = { * @returns object The lookup table containing all of the words and * word forms from the dictionary. */ - + _parseDIC : function (data) { data = this._removeDicComments(data); - + var lines = data.split("\n"); var dictionaryTable = {}; - + function addWord(word, rules) { // Some dictionaries will list the same word multiple times with different rule sets. if (!dictionaryTable.hasOwnProperty(word)) { dictionaryTable[word] = null; } - + if (rules.length > 0) { if (dictionaryTable[word] === null) { dictionaryTable[word] = []; @@ -447,52 +624,52 @@ Typo.prototype = { dictionaryTable[word].push(rules); } } - + // The first line is the number of words in the dictionary. for (var i = 1, _len = lines.length; i < _len; i++) { var line = lines[i]; - + if (!line) { // Ignore empty lines. 
continue; } var parts = line.split("/", 2); - + var word = parts[0]; // Now for each affix rule, generate that form of the word. if (parts.length > 1) { var ruleCodesArray = this.parseRuleCodes(parts[1]); - + // Save the ruleCodes for compound word situations. if (!("NEEDAFFIX" in this.flags) || ruleCodesArray.indexOf(this.flags.NEEDAFFIX) == -1) { addWord(word, ruleCodesArray); } - + for (var j = 0, _jlen = ruleCodesArray.length; j < _jlen; j++) { var code = ruleCodesArray[j]; - + var rule = this.rules[code]; - + if (rule) { var newWords = this._applyRule(word, rule); - + for (var ii = 0, _iilen = newWords.length; ii < _iilen; ii++) { var newWord = newWords[ii]; - + addWord(newWord, []); - + if (rule.combineable) { for (var k = j + 1; k < _jlen; k++) { var combineCode = ruleCodesArray[k]; - + var combineRule = this.rules[combineCode]; - + if (combineRule) { if (combineRule.combineable && (rule.type != combineRule.type)) { var otherNewWords = this._applyRule(newWord, combineRule); - + for (var iii = 0, _iiilen = otherNewWords.length; iii < _iiilen; iii++) { var otherNewWord = otherNewWords[iii]; addWord(otherNewWord, []); @@ -503,7 +680,7 @@ Typo.prototype = { } } } - + if (code in this.compoundRuleCodes) { this.compoundRuleCodes[code].push(word); } @@ -513,28 +690,28 @@ Typo.prototype = { addWord(word.trim(), []); } } - + return dictionaryTable; }, - - + + /** * Removes comment lines and then cleans up blank lines and trailing whitespace. * * @param {String} data The data from a .dic file. * @return {String} The cleaned-up data. */ - + _removeDicComments : function (data) { // I can't find any official documentation on it, but at least the de_DE // dictionary uses tab-indented lines as comments. 
- + // Remove comments data = data.replace(/^\t.*$/mg, ""); - + return data; }, - + parseRuleCodes : function (textCodes) { if (!textCodes) { return []; @@ -544,18 +721,18 @@ Typo.prototype = { } else if (this.flags.FLAG === "long") { var flags = []; - + for (var i = 0, _len = textCodes.length; i < _len; i += 2) { flags.push(textCodes.substr(i, 2)); } - + return flags; } else if (this.flags.FLAG === "num") { return textCodes.split(","); } }, - + /** * Applies an affix rule to a word. * @@ -563,41 +740,41 @@ Typo.prototype = { * @param {Object} rule The affix rule. * @returns {String[]} The new words generated by the rule. */ - + _applyRule : function (word, rule) { var entries = rule.entries; var newWords = []; - + for (var i = 0, _len = entries.length; i < _len; i++) { var entry = entries[i]; - + if (!entry.match || word.match(entry.match)) { var newWord = word; - + if (entry.remove) { newWord = newWord.replace(entry.remove, ""); } - + if (rule.type === "SFX") { newWord = newWord + entry.add; } else { newWord = entry.add + newWord; } - + newWords.push(newWord); - + if ("continuationClasses" in entry) { for (var j = 0, _jlen = entry.continuationClasses.length; j < _jlen; j++) { var continuationRule = this.rules[entry.continuationClasses[j]]; - + if (continuationRule) { newWords = newWords.concat(this._applyRule(newWord, continuationRule)); } /* else { // This shouldn't happen, but it does, at least in the de_DE dictionary. - // I think the author mistakenly supplied lower-case rule codes instead + // I think the author mistakenly supplied lower-case rule codes instead // of upper-case. } */ @@ -605,10 +782,10 @@ Typo.prototype = { } } } - + return newWords; }, - + /** * Checks whether a word or a capitalization variant exists in the current dictionary. * The word is trimmed and several variations of capitalizations are checked. @@ -619,68 +796,68 @@ Typo.prototype = { * @param {String} aWord The word to check. 
* @returns {Boolean} */ - + check : function (aWord) { if (!this.loaded) { throw "Dictionary not loaded."; } - + // Remove leading and trailing whitespace var trimmedWord = aWord.replace(/^\s\s*/, '').replace(/\s\s*$/, ''); - + if (this.checkExact(trimmedWord)) { return true; } - + // The exact word is not in the dictionary. if (trimmedWord.toUpperCase() === trimmedWord) { // The word was supplied in all uppercase. // Check for a capitalized form of the word. var capitalizedWord = trimmedWord[0] + trimmedWord.substring(1).toLowerCase(); - + if (this.hasFlag(capitalizedWord, "KEEPCASE")) { // Capitalization variants are not allowed for this word. return false; } - + if (this.checkExact(capitalizedWord)) { return true; } } - + var lowercaseWord = trimmedWord.toLowerCase(); - + if (lowercaseWord !== trimmedWord) { if (this.hasFlag(lowercaseWord, "KEEPCASE")) { // Capitalization variants are not allowed for this word. return false; } - + // Check for a lowercase form if (this.checkExact(lowercaseWord)) { return true; } } - + return false; }, - + /** * Checks whether a word exists in the current dictionary. * * @param {String} word The word to check. * @returns {Boolean} */ - + checkExact : function (word) { if (!this.loaded) { throw "Dictionary not loaded."; } - var ruleCodes = this.dictionaryTable[word]; - + var ruleCodes = this._getDictionaryEntry(word); + var i, _len; - + if (typeof ruleCodes === 'undefined') { // Check if this might be a compound word. if ("COMPOUNDMIN" in this.flags && word.length >= this.flags.COMPOUNDMIN) { @@ -706,7 +883,7 @@ Typo.prototype = { return false; }, - + /** * Looks up whether a given word is flagged with a given flag. * @@ -714,7 +891,7 @@ Typo.prototype = { * @param {String} flag The flag in question. 
* @return {Boolean} */ - + hasFlag : function (word, flag, wordFlags) { if (!this.loaded) { throw "Dictionary not loaded."; @@ -722,17 +899,17 @@ Typo.prototype = { if (flag in this.flags) { if (typeof wordFlags === 'undefined') { - wordFlags = Array.prototype.concat.apply([], this.dictionaryTable[word]); + wordFlags = Array.prototype.concat.apply([], this._getDictionaryEntry(word)); } - + if (wordFlags && wordFlags.indexOf(this.flags[flag]) !== -1) { return true; } } - + return false; }, - + /** * Returns a list of suggestions for a misspelled word. * @@ -743,9 +920,9 @@ Typo.prototype = { * @param {Number} [limit=5] The maximum number of suggestions to return. * @returns {String[]} The array of suggestions. */ - + alphabet : "", - + suggest : function (word, limit) { if (!this.loaded) { throw "Dictionary not loaded."; @@ -762,46 +939,46 @@ Typo.prototype = { return this.memoized[word]['suggestions'].slice(0, limit); } } - + if (this.check(word)) return []; - + // Check the replacement table. for (var i = 0, _len = this.replacementTable.length; i < _len; i++) { var replacementEntry = this.replacementTable[i]; - + if (word.indexOf(replacementEntry[0]) !== -1) { var correctedWord = word.replace(replacementEntry[0], replacementEntry[1]); - + if (this.check(correctedWord)) { return [ correctedWord ]; } } } - + var self = this; self.alphabet = "abcdefghijklmnopqrstuvwxyz"; - + /* if (!self.alphabet) { // Use the alphabet as implicitly defined by the words in the dictionary. 
var alphaHash = {}; - + for (var i in self.dictionaryTable) { for (var j = 0, _len = i.length; j < _len; j++) { alphaHash[i[j]] = true; } } - + for (var i in alphaHash) { self.alphabet += i; } - + var alphaArray = self.alphabet.split(""); alphaArray.sort(); self.alphabet = alphaArray.join(""); } */ - + /** * Returns a hash keyed by all of the strings that can be made by making a single edit to the word (or words in) `words` * The value of each entry is the number of unique ways that the resulting word can be made. @@ -811,9 +988,9 @@ Typo.prototype = { */ function edits1(words, known_only) { var rv = {}; - + var i, j, _iilen, _len, _jlen, _edit; - + if (typeof words == 'string') { var word = words; words = {}; @@ -823,7 +1000,7 @@ Typo.prototype = { for (var word in words) { for (i = 0, _len = word.length + 1; i < _len; i++) { var s = [ word.substring(0, i), word.substring(i) ]; - + if (s[1]) { _edit = s[0] + s[1].substring(1); @@ -836,7 +1013,7 @@ Typo.prototype = { } } } - + // Eliminate transpositions of identical letters if (s[1].length > 1 && s[1][1] !== s[1][0]) { _edit = s[0] + s[1][1] + s[1][0] + s[1].substring(2); @@ -885,7 +1062,7 @@ Typo.prototype = { } } } - + return rv; } @@ -893,10 +1070,10 @@ Typo.prototype = { // Get the edit-distance-1 and edit-distance-2 forms of this word. var ed1 = edits1(word); var ed2 = edits1(ed1, true); - + // Sort the edits based on how many different ways they were created. 
var weighted_corrections = ed2; - + for (var ed1word in ed1) { if (!self.check(ed1word)) { continue; @@ -909,40 +1086,40 @@ Typo.prototype = { weighted_corrections[ed1word] = ed1[ed1word]; } } - + var i, _len; var sorted_corrections = []; - + for (i in weighted_corrections) { if (weighted_corrections.hasOwnProperty(i)) { sorted_corrections.push([ i, weighted_corrections[i] ]); } } - + function sorter(a, b) { if (a[1] < b[1]) { return -1; } - + // @todo If a and b are equally weighted, add our own weight based on something like the key locations on this language's default keyboard. return 1; } - + sorted_corrections.sort(sorter).reverse(); var rv = []; var capitalization_scheme = "lowercase"; - + if (word.toUpperCase() === word) { capitalization_scheme = "uppercase"; } else if (word.substr(0, 1).toUpperCase() + word.substr(1).toLowerCase() === word) { capitalization_scheme = "capitalized"; } - + var working_limit = limit; for (i = 0; i < Math.min(working_limit, sorted_corrections.length); i++) { @@ -952,7 +1129,7 @@ Typo.prototype = { else if ("capitalized" === capitalization_scheme) { sorted_corrections[i][0] = sorted_corrections[i][0].substr(0, 1).toUpperCase() + sorted_corrections[i][0].substr(1); } - + if (!self.hasFlag(sorted_corrections[i][0], "NOSUGGEST") && rv.indexOf(sorted_corrections[i][0]) == -1) { rv.push(sorted_corrections[i][0]); } @@ -964,7 +1141,7 @@ Typo.prototype = { return rv; } - + this.memoized[word] = { 'suggestions': correct(word), 'limit': limit @@ -978,4 +1155,4 @@ Typo.prototype = { // Support for use as a node.js module. 
if (typeof module !== 'undefined') { module.exports = Typo; -} \ No newline at end of file +} diff --git a/test/english.js b/test/english.js new file mode 100644 index 00000000..899b7514 --- /dev/null +++ b/test/english.js @@ -0,0 +1,202 @@ +var assert = require('assert'); +var Typo = require('../src/typo'); + +var equal = assert.equal; +var deepEqual = assert.deepEqual; + +// NOTE: This is mostly duplicated from tests/english.js for testing the precomputed functions only available in node-like environments +describe('english', function() { + + testGenerator(function(dict, next) { + dict.load('en_US'); + next(); + }) + + context('precomputed (sync)', function() { + testGenerator(function(dict, done) { + dict.loadPrecomputed('en_US') + done(); + }) + }); + + context('precomputed (async)', function() { + testGenerator(function(dict, done) { + dict.loadPrecomputed('en_US', null, null, { + asyncLoad: true, + loadedCallback: () => done() + }) + }) + }); + + +}); + +function testGenerator(b) { + + var dict; + + before(function(done) { + dict = new Typo(); + b(dict, done); + }); + + + it("Dictionary object attributes are properly set", function () { + equal(dict.dictionary, "en_US"); + }); + + it("Suggestions", function () { + deepEqual(dict.suggest("speling", 3), [ "spelling", "spieling", "spewing" ]); + + // Repeated calls function properly. + deepEqual(dict.suggest("speling", 1), [ "spelling" ]); + deepEqual(dict.suggest("speling"), [ "spelling", "spieling", "spewing", "selling", "peeling" ]); + deepEqual(dict.suggest("speling", 2), [ "spelling", "spieling" ]); + deepEqual(dict.suggest("speling"), [ "spelling", "spieling", "spewing", "selling", "peeling" ]); + + // Requesting more suggestions than will be returned doesn't break anything. 
+ deepEqual(dict.suggest("spartang", 50), [ "spartan", "sparing", "starting", "sprang", "sporting", "spurting", "smarting", "sparking", "sparling", "sparring", "parting", "spatting" ]); + deepEqual(dict.suggest("spartang", 30), [ "spartan", "sparing", "starting", "sprang", "sporting", "spurting", "smarting", "sparking", "sparling", "sparring", "parting", "spatting" ]); + deepEqual(dict.suggest("spartang", 1), [ "spartan" ]); + + deepEqual(dict.suggest("spitting"), [ ], "Correctly spelled words receive no suggestions."); + deepEqual(dict.suggest("spitting"), [ ], "Correctly spelled words receive no suggestions."); + + // Words that are object properties don't break anything. + deepEqual(dict.suggest("length"), [ ], "Correctly spelled words receive no suggestions."); + deepEqual(dict.suggest("length"), [ ], "Correctly spelled words receive no suggestions."); + }); + + it("Correct checking of words with no affixes", function () { + equal(dict.check("I"), true); + equal(dict.check("is"), true); + equal(dict.check("makes"), true); + equal(dict.check("example"), true); + equal(dict.check("a"), true); + equal(dict.check("aback"), true); + equal(dict.check("juicily"), true); + equal(dict.check("palmate"), true); + equal(dict.check("palpable"), true); + }); + + it("Correct checking of root words with single affixes (affixes not used)", function () { + equal(dict.check("paling"), true); + equal(dict.check("arrangeable"), true); + equal(dict.check("arrant"), true); + equal(dict.check("swabby"), true); + }); + + it("Correct checking of root words with single affixes (affixes used)", function () { + equal(dict.check("palmer's"), true); + equal(dict.check("uncritically"), true); + equal(dict.check("hypersensitiveness"), true); + equal(dict.check("illusive"), true); + }); + + it("Capitalization is respected.", function () { + equal(dict.check("A"), true); + equal(dict.check("a"), true); + equal(dict.check("AA"), true); + equal(dict.check("ABANDONER"), true); + 
equal(dict.check("abandonER"), true); + equal(dict.check("Abandoner"), true); + equal(dict.check("Abbe"), true); + equal(dict.check("Abbott's"), true); + equal(dict.check("abbott's"), false); + equal(dict.check("Abba"), true); + equal(dict.check("ABBA"), true); + equal(dict.check("Abba's"), true); + equal(dict.check("Yum"), true); + equal(dict.check("yum"), true); + equal(dict.check("YUM"), true); + equal(dict.check("aa"), false); + equal(dict.check("aaron"), false); + equal(dict.check("abigael"), false); + equal(dict.check("YVES"), true); + equal(dict.check("yves"), false); + equal(dict.check("Yves"), true); + equal(dict.check("MACARTHUR"), true); + equal(dict.check("MacArthur"), true); + equal(dict.check("Alex"), true); + equal(dict.check("alex"), false); + }); + + it("Words not in the dictionary in any form are marked as misspelled.", function () { + equal(dict.check("aaraara"), false); + equal(dict.check("aaraara"), false); + equal(dict.check("aaraara"), false); + equal(dict.check("aaraara"), false); + equal(dict.check("aaraara"), false); + }); + + it("Leading and trailing whitespace is ignored.", function () { + equal(dict.check("concept "), true); + equal(dict.check(" concept"), true); + equal(dict.check(" concept"), true); + equal(dict.check("concept "), true); + equal(dict.check(" concept "), true); + }); + + it("ONLYINCOMPOUND flag is respected", function () { + equal(dict.check("1th"), false); + equal(dict.check("2th"), false); + equal(dict.check("3th"), false); + }); + + it("Compound words", function () { + equal(dict.check("1st"), true); + equal(dict.check("2nd"), true); + equal(dict.check("3rd"), true); + equal(dict.check("4th"), true); + equal(dict.check("5th"), true); + equal(dict.check("6th"), true); + equal(dict.check("7th"), true); + equal(dict.check("8th"), true); + equal(dict.check("9th"), true); + equal(dict.check("10th"), true); + equal(dict.check("11th"), true); + equal(dict.check("12th"), true); + equal(dict.check("13th"), true); + 
equal(dict.check("1th"), false); + equal(dict.check("2rd"), false); + equal(dict.check("3th"), false); + equal(dict.check("4rd"), false); + equal(dict.check("100st"), false); + }); + + it("Possessives are properly checked.", function () { + equal(dict.check("concept's"), true); + // acceptability's is in the dictionary including the 's + equal(dict.check("acceptability's's"), false); + }); + + it("Replacement rules are implemented", function () { + deepEqual(dict.suggest("wagh"), [ "weigh" ]); + deepEqual(dict.suggest("ceit"), [ "cat" ]); + deepEqual(dict.suggest("seau"), [ "so" ]); + deepEqual(dict.suggest("shaccable"), [ "shakable" ]); + deepEqual(dict.suggest("soker"), [ "choker" ]); + }); + + it("Contractions", function () { + equal(dict.check("aren't"), true); + equal(dict.check("I'm"), true); + equal(dict.check("we're"), true); + equal(dict.check("didn't"), true); + equal(dict.check("didn'ts"), false); + equal(dict.check("he're"), false); + }); + + it("Capitalizations are handled properly.", function () { + deepEqual(dict.suggest("Wagh"), ["Weigh"]); + deepEqual(dict.suggest("CEIT"), [ "CERT", "CHIT", "CIT", "CENT", "CUT" ]); + }); + + it("NOSUGGEST is respected", function () { + // 'fart' is marked NOSUGGEST, and I've confirmed that it would be in the suggestions if we don't respect that flag. + equal(dict.suggest("faxt").indexOf('fart'), -1); + + // If a NOSUGGEST word would be in the top 10 ('fart' is #5), Typo should still return the expected number of results. + equal(dict.suggest("faxt", 10).length, 10); + }); +} diff --git a/tests/british.html b/tests/british.html index f2979ecf..b00521ae 100644 --- a/tests/british.html +++ b/tests/british.html @@ -6,7 +6,7 @@ - +
diff --git a/tests/english.html b/tests/english.html index 959d28be..2818ec1f 100644 --- a/tests/english.html +++ b/tests/english.html @@ -6,7 +6,7 @@ - + diff --git a/tests/english.js b/tests/english.js index c09b9729..d3215155 100644 --- a/tests/english.js +++ b/tests/english.js @@ -1,13 +1,13 @@ function run() { var utilityDict = new Typo(); - var affData = utilityDict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.aff")); - var wordData = utilityDict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.dic")); + var affData = utilityDict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.aff")); + var wordData = utilityDict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.dic")); var hashDict = new Typo("en_US", affData, wordData); testDictionary(hashDict); - var dict = new Typo("en_US", null, null, { dictionaryPath : "../typo/dictionaries", asyncLoad : true, loadedCallback : function () { + var dict = new Typo("en_US", null, null, { dictionaryPath : "../src/dictionaries", asyncLoad : true, loadedCallback : function () { testDictionary(dict); }}); } diff --git a/tests/french.html b/tests/french.html index e2e32578..a016ed5d 100644 --- a/tests/french.html +++ b/tests/french.html @@ -6,7 +6,7 @@ - + diff --git a/tests/general.html b/tests/general.html index 1eb2b8fc..6a4769ae 100644 --- a/tests/general.html +++ b/tests/general.html @@ -6,7 +6,7 @@ - + diff --git a/tests/general.js b/tests/general.js index fe3a763c..b8ea4f45 100644 --- a/tests/general.js +++ b/tests/general.js @@ -20,12 +20,12 @@ function run() { }); test("_readFile can load a file synchronously", function() { - var data = empty_dict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.dic")); + var data = empty_dict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.dic")); ok(data && data.length > 0); }); asyncTest("_readFile can load a file asynchronously", function(assert) { - 
empty_dict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.dic"), null, true).then(function(data) { + empty_dict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.dic"), null, true).then(function(data) { assert.ok(data && data.length > 0); QUnit.start(); }, function(err) { @@ -41,8 +41,8 @@ function run() { } test("Dictionary instantiated with preloaded data is setup correctly", function() { - var affData = empty_dict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.aff")); - var wordData = empty_dict._readFile(chrome.extension.getURL("../typo/dictionaries/en_US/en_US.dic")); + var affData = empty_dict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.aff")); + var wordData = empty_dict._readFile(chrome.extension.getURL("../src/dictionaries/en_US/en_US.dic")); var dict = new Typo("en_US", affData, wordData); checkLoadedDict(dict); }); diff --git a/tests/german.html b/tests/german.html index 7abd4140..60162364 100644 --- a/tests/german.html +++ b/tests/german.html @@ -6,7 +6,7 @@ - + diff --git a/tests/latin.html b/tests/latin.html index 8a3bca0b..842f0850 100644 --- a/tests/latin.html +++ b/tests/latin.html @@ -6,7 +6,7 @@ - +