1 /* (This is the new BSD license.) 2 * Copyright (c) 2012-2014, Chris Culy 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the Chris Culy nor the 13 * names of its contributors may be used to endorse or promote 14 * products from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 "use strict"; 29 var doubletree = doubletree || {}; 30 31 (function(){ 32 33 /** 34 * @class doubletree.Trie 35 * This is the class for the specialized trie that is the data for {@link doubletree.DoubleTree}. 36 * The Trie will get pieces of data that contain fields. Some (possibly all) of those fields will be used for distinguishing among items. 37 * For example, we might have an "author" field, but not use it when distinguishing among items for the Trie. 38 * @param caseSensitive are the elements in the first distinguishing field compared in a case sensitive fashion 39 * @param fldNames the names of the fields 40 * @param fldDelim the field delimiter 41 * @distinguishingFldsArray the distinguishing fields 42 * @undistinguishedRoot true if the root should be calculated without using the distinguishing fields 43 */ 44 doubletree.Trie = function(caseSensitive, fldNames, fldDelim, distinguishingFldsArray, undistinguishedRoot) { 45 var endNG = " "; 46 var rootName = "_root_"; 47 var noCase = (! caseSensitive) && true; 48 if (! fldNames) { 49 fldNames = ["item"]; 50 } 51 var fieldNames = fldNames; 52 if (! fieldDelim) { 53 fieldDelim = "\t"; //default 54 } 55 var fieldDelim = fldDelim; 56 if (! distinguishingFieldsArray) { 57 distinguishingFieldsArray = [ fieldNames[0] ]; 58 } 59 var distinguishingFieldsArray = distinguishingFldsArray; 60 var undistinguishedRt = undistinguishedRoot; 61 if (undefined == undistinguishedRt) { 62 undistinguishedRt = true; //TBD: check to make sure this doesn't break anything 63 } 64 65 var trie = new TrieNode(rootName,-1,0); 66 67 /** @private */ 68 function TrieNode(item, id, count) { 69 this.id = id; 70 this.count = count; 71 this.info = {"count":count, "ids":{}}; 72 73 if (item == null) { 74 this.item = rootName; 75 } else { 76 this.item = item; 77 this.info.ids = {}; 78 this.info.ids[id] = true; 79 var flds = item.split(fieldDelim); 80 for(var i in flds) { 81 this.info[ fieldNames[i] ] = [ flds[i] ]; 82 } 83 } 84 this.nodes = {}; 85 86 /** @private */ 87 this.addNgram = function(itemArray, id, count) { 88 if (! count) { 89 count = 1; 90 } 91 var thisItem, thisKey; 92 if (itemArray.length > 0) { 93 thisItem = itemArray.shift(); 94 95 var theseFlds = thisItem.split(fieldDelim); 96 97 if (undistinguishedRt && this.item == rootName) { 98 thisKey = ""; 99 } else { 100 thisKey = theseFlds.filter(function(f,i) { 101 return distinguishingFieldsArray.indexOf( fieldNames[i] ) > -1; 102 }) 103 .map(function(f) { 104 if (noCase) { 105 return f.toLocaleLowerCase(); 106 } 107 return f; 108 }) 109 .join(fieldDelim); 110 } 111 112 } else { 113 thisItem = endNG; 114 thisKey = thisItem; 115 } 116 117 var subTrie; 118 if (thisKey in this.nodes && this.nodes[thisKey] instanceof TrieNode) { //we need the instanceof TrieNode so we can override Object properties -- hope that none are already arrays 119 subTrie = this.nodes[thisKey]; 120 subTrie.info.count += count; 121 subTrie.info.ids[id] = true; 122 123 for(var f in theseFlds) { 124 var thisFld = theseFlds[f]; 125 if (subTrie.info[ fieldNames[f] ].indexOf( thisFld ) == -1 ){ 126 subTrie.info[ fieldNames[f] ].push(thisFld); 127 } 128 } 129 130 131 } else { 132 subTrie = new TrieNode(thisItem, id, count); 133 this.nodes[thisKey] = subTrie; 134 } 135 if (thisItem != endNG) { 136 subTrie.addNgram(itemArray,id, count); 137 } 138 } 139 140 /** @private */ 141 this.getUniqRoot = function() { 142 if (this.item == rootName) { 143 var children = Object.keys(this.nodes); 144 if (children.length == 1) { 145 return this.nodes[ children[0] ]; 146 } 147 } 148 149 return this; 150 } 151 152 /** @private */ 153 this.toTree = function(filterFuns) { 154 155 function toTreeHelper(filterFuns, descendentLevel, trieData) { 156 157 var what = {"children":[]}; 158 what.name = trieData.item; 159 what.info = {}; 160 for(var k in trieData.info) { 161 if (typeof(trieData.info[k]) === 'Object') { 162 what.info[k] = {}; 163 for(var k2 in trieData.info[k]) { 164 what.info[k][k2] = this.info[k][k2]; 165 } 166 } else { 167 what.info[k] = trieData.info[k]; 168 } 169 } 170 what.pruned = {}; 171 172 173 for(var item in trieData.nodes) { 174 var itemNode = trieData.nodes[item]; 175 var thisFilter = filterFuns[descendentLevel]; 176 if ( ! thisFilter || (thisFilter && thisFilter(itemNode.info)) ) { 177 what.children.push( toTreeHelper(filterFuns, descendentLevel +1, itemNode) ); 178 if (itemNode.pruned != {}) { 179 addTo(what.pruned, itemNode.pruned); 180 } 181 } else { 182 addTo(what.pruned, itemNode.info.ids); 183 } 184 } 185 186 what.info.continuations = what.children.length; 187 //this is to record info we need for sizing the tree, since D3 automatically scales to fit, which is not what we want 188 //we also need to keep track of the minimum count (the root always has the max, of course), for scaling 189 if (what.children.length == 0) { 190 what.children = null; //the trees expect null if there are no children, not the empty array. Odd, but true. 191 what.maxChildren = 0; 192 193 if (what.name) { 194 what.maxLen = what.name.length; 195 } else { 196 what.maxLen = 0; 197 } 198 199 what.minCount = what.info.count; 200 //what.maxChildren = 0; //new 201 202 } else { 203 var cMax = d3.max( what.children.map(function(c) {return c.maxChildren;}) ); 204 what.maxChildren = Math.max(what.children.length, cMax); 205 206 var maxLen = d3.max( what.children.map(function(c) {return c.maxLen;})); 207 what.maxLen = Math.max(maxLen, what.name.length); 208 209 what.minCount = d3.min( what.children.map(function(c) {return c.minCount;})); //the children are always <= the parent 210 } 211 return what; 212 } 213 214 if (! filterFuns ) { 215 filterFuns = []; 216 } 217 218 var trieData = JSON.parse(JSON.stringify(this)); //CuC make a copy of the data, to keep the real trie immutable 219 220 return toTreeHelper(filterFuns, 0, trieData); 221 } 222 } 223 224 225 226 227 /** 228 * Add an ngram to the Trie 229 * @param itemArray an array of delimited items (the ngrams) 230 * @param id an id for this ngram 231 * @param count a count for this ngram. Default is 1 232 */ 233 this.addNgram = function(itemArray, id, count) {trie.addNgram(itemArray, id, count);}; 234 235 /** 236 * get the unique root of this Trie. Used only by {@link DoubleTree} 237 * @returns a new Trie with a unique item as the root 238 */ 239 this.getUniqRoot = function() { 240 var what = new doubletree.Trie((!noCase), fieldNames, fieldDelim, distinguishingFieldsArray); 241 what.trie( trie.getUniqRoot() ); 242 return what; 243 }; 244 245 /** 246 * convert the Trie to a tree structure for display. Used only by {@link DoubleTree} 247 * @param filterFuns the filtering functions to apply to the tree see {@link DoubleTree.filters} 248 * @param descendentLevel the current level we are filtering 249 * @returns the tree 250 */ 251 this.toTree = function(filterFuns, descendentLevel) {return trie.toTree(filterFuns, descendentLevel);}; 252 253 /** 254 * serialize the Trie as a JSON string 255 * @returns the JSON string representation of the Trie 256 */ 257 this.serialize = function() { 258 return JSON.stringify(this); 259 } 260 261 /** 262 * make this Trie have the values of a previously serialized Trie see {@link #serialize} 263 */ 264 this.deserialize = function(serialized) { 265 var obj = JSON.parse(serialized); 266 267 endNG = obj.endNG(); 268 rootName = obj.rootName(); 269 noCase = obj.caseSensitive(); 270 fieldNames = obj.fieldNames(); 271 fieldDelim = obj.fieldDelim(); 272 distinguishingFieldsArray = obj.distinguishingFieldsArray(); 273 trie = obj.trie(); 274 275 } 276 277 //getters -- the properties are readonly, set in constructor 278 279 //private, only used in deserialization 280 /** @private */ 281 this.endNG = function() { 282 return endNG; 283 } 284 //private, only used in deserialization 285 /** @private */ 286 this.rootName = function() { 287 return rootName; 288 } 289 290 //private, also a setter, only used in deserialization and getUniqRoot; 291 /** @private */ 292 this.trie = function(value) { 293 if (arguments.length > 0) { 294 trie = value; 295 } 296 return trie; 297 } 298 299 /** 300 * @returns whether this Trie uses case sensitive comparison 301 */ 302 this.caseSensitive = function() { 303 return ! noCase; 304 } 305 306 /** 307 * get the field names in the data 308 * @returns the field names in the data 309 */ 310 this.fieldNames = function() { 311 return fieldNames; 312 } 313 314 /** 315 * get the field delimiter for the data 316 * @returns the field delimiter for the data 317 */ 318 this.fieldDelim = function() { 319 return fieldDelim; 320 } 321 322 /** 323 * get the distinguishing fields for the data 324 * @returns the distinguishing fields for the data 325 */ 326 this.distinguishingFieldsArray = function() { 327 return distinguishingFieldsArray; 328 } 329 330 //add key/vals of o2 to o1 and return o1; (top level key-value only, o2 values maintained over o1) 331 /** @private */ 332 function addTo(o1, o2) { 333 for(var k in o2) { 334 o1[k] = o2[k]; 335 } 336 } 337 338 339 } 340 341 })(); 342