1 /* (This is the new BSD license.) 2 * Copyright (c) 2012-2014, Chris Culy 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the Chris Culy nor the 13 * names of its contributors may be used to endorse or promote 14 * products from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 "use strict"; 29 /** 30 @namespace textmodel 31 All of the functionality is in the textmodel namespace 32 */ 33 var textmodel = textmodel || {}; 34 35 (function(){ 36 //split string on spaces (assume already tokenized/analyzed, and that caseSensitive is only for token, which is in first position) 37 /** 38 * @class textmodel.TextHash 39 * This is the class for the TextHash document model. 40 * <p> 41 * It is used in the samples as a way of providing information to the {@doubletree.DoubleTree} to create its {@doubletree.Trie} data structure. 42 * It is not a <em>necessary</em> component of DoubleTree — all that really matters is that the relevant {@doubletree.Trie}s get created. 43 * For example, instead of TextHash, a database query might provide the input to {@doubletree.Trie}. 44 * <p> 45 * The TextHash maintains a hash of the data items where the keys are based on the distinguishing fields. The data parameters, except for baseField, 46 * should be the same as those used in the DoubleTree that will visualize the data. 47 * <p> 48 * "Items" are types, which are associated with token ids. Some functions expect an full item as an argument. Others expect a key as an argument. {@link #itemToIndex} converts a full item to a key. 49 * <p> 50 * @property {number} numTypes The number of types (readOnly) 51 * @property {number} numTokens The number of tokens (readOnly) 52 * 53 * @param string the input string, where each item is separated by whitespace. 54 * @param caseSensitive is the comparison of the baseField case sensitive or not 55 * @param fldNames the names of the fields in the data items 56 * @param fldDelim the field delimter in the data items. Note: it cannot be a whitespace (e.g. tab), since whitespace is used to delimit items 57 * @param distinguishingFldsArray the fields that determine identity 58 * @param baseField the primary field for comparison and display (typically token or lemma, but also possibly part of speech) 59 * @param useRecords blank lines are treated as delimiting units (records) in the text. Default is false 60 */ 61 textmodel.TextHash = function(string, caseSensitive, fldNames, fldDelim, distinguishingFldsArray, baseField, useRecords) { 62 var that = this; 63 var kIDKey = "id" + fldDelim; 64 var kRecDelim = String.fromCharCode(30); 65 this.useRecords = useRecords && true; 66 67 this.baseField = baseField; 68 this.baseFieldIdx = fldNames.indexOf(baseField); 69 if (this.baseFieldIdx == -1) { 70 this.baseFieldIdx = 0; 71 } 72 if (this.useRecords) { 73 string = string.replace(/\s*\n\n+/g, " " + kRecDelim + " "); //i.e. treat kRecDelim as a token 74 } 75 this.items = string.replace(/\s*\n\s*/g," ").trim().split(" "); //array of the items (tokens) 76 77 this.itemObjs = []; //object representations of the items. For now duplicating items, but maybe we can combine them 78 this.indices = {}; //for each type, a list of its token positions 79 80 this.numTypes = 0; //filled in below 81 this.numTokens = this.items.length; 82 83 var numItems = this.items.length; 84 var lastItem = numItems-1; 85 if (string != "") { //string is empty if we will load from JSON 86 for(var i=0;i<numItems;i++) { 87 88 var item = convertItem(this.items[i]); 89 this.itemObjs[i] = itemToObject(item, i); 90 91 if (! (item in this.indices && this.indices[item] instanceof Array)) { //we need the instanceof Array so we can override Object properties -- hope that none are already arrays 92 this.indices[item] = []; 93 } 94 try { 95 this.indices[item].push(i); 96 } catch (e) { 97 console.log("Couldn't add: " + item); 98 } 99 100 } 101 } 102 103 this.numTypes = Object.keys( this.indices ).length; 104 105 //this expects a full form 106 /** 107 * is the item in the model 108 * @param item a full item (not a key) 109 * @returns true if the item is in the model, false otherwise 110 */ 111 this.containsItem = function(item) { 112 return convertItem(item) in this.indices; 113 } 114 115 //this expects an index form 116 /** 117 * is the item key in the model 118 * @param item a key (not a full item) 119 * @returns true if the item key is in the model, false otherwise 120 */ 121 this.containsIndex = function(item) { 122 return item in this.indices; 123 } 124 125 //includeOnly is optional object with keys of lines *in this result* to include -- used for filtering after have results 126 //TBD: use (d3.)set instead of this dumb hash 127 128 //return array of [array of prefixes, array of item, array ofsuffixes], where prefixes and suffixes are of length contextLen; exact match only for now 129 130 //this expects an index form 131 /** 132 * get the information associated with an item 133 * <p> 134 * The information is an array of preceding items, an array of matching items, and an array of following items, 135 * where the preceding and following items are of length contextLen. 136 * @param item a key (not a full item) 137 * @param contextLen the length of the preceding and following context to be returned 138 * @param includeOnly an object where the keys are the item ids to be included (optional) 139 * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item 140 * @param contextFilters an object with "include" OR "exclude" which have objects whose keys are fields and whose values are arrays of values of those fields e.g. {"include":{"POS":["NN","NNS"]}} would include in the context only those items whose POS is NN or NNS. 141 * Similarly, "leftEnd" and "rtEnd" (both are possible together) indicate properties determining the left and right end points of the context, excluding those elements. So {"leftEnd":{"POS":["SENT"]}, "rtEnd":{"POS":["MD"]}} would include in the left context elements up to but not including the first SENT POS, and in the right context elements up to but not including the first MD POS. 142 * @param maxRandomHits how many random hits to return. -1 or null to return all 143 * @param puncToExclude a string of punctuation to exclude from the base field (will override any punctuation allowed via "include" in contextFilters). Default is null (i.e. include all punctuation) 144 * @returns array of [array of prefixes, array of item, array of suffixes, array of ids] 145 */ 146 this.getItem = function(item, contextLen, includeOnly, itemIsRegex, contextFilters, maxRandomHits, puncToExclude) { 147 var prefixArray = [], itemArray = [], suffixArray = [], idArray = []; 148 149 if (that.useRecords) { 150 var field1 = fldNames[0]; 151 152 if (contextFilters == null) { 153 contextFilters = {}; 154 } 155 if (typeof contextFilters["leftEnd"] === "undefined") { 156 contextFilters["leftEnd"] = {}; 157 } 158 var tmp = contextFilters["leftEnd"][field1]; 159 if (typeof contextFilters["leftEnd"][field1] === "undefined") { 160 contextFilters["leftEnd"][field1] = []; 161 } 162 contextFilters["leftEnd"][field1].push(kRecDelim); 163 164 if (typeof contextFilters["rtEnd"] === "undefined") { 165 contextFilters["rtEnd"] = {}; 166 } 167 if (typeof contextFilters["rtEnd"][field1] === "undefined") { 168 contextFilters["rtEnd"][field1] = []; 169 } 170 contextFilters["rtEnd"][field1].push(kRecDelim); 171 172 173 } 174 175 var filterPunc = function(startIndex) { 176 if (!puncToExclude) { 177 return function() {return true;}; //i.e. no filtering 178 } 179 180 var re = new RegExp("^[" + puncToExclude + "]$"); 181 182 return function(elt,idx,contextArray) { 183 return ! re.test(that.itemObjs[idx+startIndex][that.baseField]); 184 } 185 }; 186 187 var filterContext = function(startIndex){ 188 return function() {return true;} 189 }; //i.e. no filtering 190 var chopLeftEnd = function(a) { 191 return a; 192 }; //i.e. no filtering 193 var chopRtEnd = function(a) { 194 return a; 195 }; //i.e. no filtering 196 197 if (contextFilters) { 198 if (contextFilters["include"]) { 199 var toInclude = contextFilters["include"]; 200 var filterContext = function(startIndex) { //don't really need the var here, since have it above, but jsdoc thought it was global 201 return function(elt,idx,contextArray) { 202 for (var fld in toInclude) { 203 if (toInclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 204 return true; 205 } 206 } 207 return false; 208 } 209 }; 210 211 } else if (contextFilters["exclude"]) { 212 var toExclude = contextFilters["exclude"]; 213 filterContext = function(startIndex) { 214 return function(elt,idx,contextArray) { 215 for (var fld in toExclude) { 216 //var tmp = that.itemObjs[idx+startIndex]; 217 if (toExclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 218 return false; 219 } 220 } 221 return true; 222 } 223 }; 224 } 225 226 if (contextFilters["leftEnd"]) { 227 chopLeftEnd = function(a, startIndex) { 228 var where = 0; 229 var OK = true; 230 for (var idx=a.length-1;idx>-1;idx--) { 231 for (var fld in contextFilters["leftEnd"]) { 232 if (contextFilters["leftEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 233 OK = false; 234 } 235 } 236 if (!OK) { 237 where = idx +1; 238 break; 239 } 240 } 241 return a.slice(where,a.length+1); 242 } 243 } 244 245 246 if (contextFilters["rtEnd"]) { 247 chopRtEnd = function(a, startIndex) { 248 249 var OK = true; 250 return a.filter(function(elt,idx) { 251 if (!OK) { 252 return false; 253 } 254 for (var fld in contextFilters["rtEnd"]) { 255 if (contextFilters["rtEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 256 OK = false; 257 break; 258 } 259 } 260 return OK; 261 }); 262 } 263 } 264 } 265 266 var numItems = this.items.length; 267 var lastItem = numItems-1; 268 269 var hits; 270 if (itemIsRegex) { 271 var thisThat = this; 272 var re = new RegExp(item,'i'); //TBD 'i' is probably not right ... 273 var hitArrays = Object.keys(this.indices).filter(function(item){ 274 return re.test(item); 275 }) 276 .map(function(hit) { 277 return thisThat.indices[hit]; 278 }) 279 //.flatten(); //TBD use this if we can get it from a library, like Underscore.js 280 281 hits = []; 282 hitArrays.forEach(function(arr){ 283 arr.forEach(function(elt){ 284 hits.push(elt); 285 }); 286 }); 287 288 289 thisThat = null; 290 } else { 291 hits = this.indices[item]; 292 } 293 294 295 296 var n = hits.length; 297 var nMinus1 = n - 1; 298 299 for (var i = 0; i < n; i++) { 300 var thisIndex = hits[i]; 301 var origItem = this.items[thisIndex]; 302 var thisID = this.itemObjs[thisIndex][kIDKey]; 303 if (includeOnly != null && ! (thisID in includeOnly) ) { 304 continue; 305 } 306 307 itemArray.push( origItem ); 308 idArray.push(thisID); 309 310 if (thisIndex == 0) { 311 prefixArray.push( [] ); 312 } else { 313 var pStart = Math.max(0, thisIndex - contextLen); //start of prefix 314 var prefs = chopLeftEnd(this.items.slice(pStart, thisIndex), pStart); 315 pStart += contextLen - prefs.length; 316 prefixArray.push( prefs.filter(function(elt,idx,contextArray) { 317 return filterPunc(pStart)(elt,idx,contextArray) && filterContext(pStart)(elt,idx,contextArray); 318 }) ); 319 } 320 321 if (thisIndex == lastItem) { 322 suffixArray.push( [] ); 323 } else { 324 var sEnd = Math.min(lastItem, thisIndex + contextLen); 325 var sStart = thisIndex + 1; 326 var suffs = chopRtEnd(this.items.slice(sStart, sEnd + 1), sStart); 327 suffixArray.push( suffs.filter(function(elt,idx,contextArray) { 328 return filterPunc(sStart)(elt,idx,contextArray) && filterContext(sStart)(elt,idx,contextArray); 329 }) ); 330 } 331 } 332 333 //select the number of random hits 334 if (maxRandomHits !== null && maxRandomHits > 0) { 335 //create an array 0..n, shuffle it, then take the first maxRandomHits 336 var indices = d3.shuffle(d3.range(0,itemArray.length)).slice(0,maxRandomHits); 337 338 //now permute our originals 339 prefixArray = d3.permute(prefixArray, indices); 340 itemArray = d3.permute(itemArray, indices); 341 suffixArray = d3.permute(suffixArray, indices); 342 idArray = d3.permute(idArray, indices); 343 } 344 345 return [prefixArray, itemArray, suffixArray, idArray]; 346 } 347 348 //this expects a regular expression to match against index forms 349 /** 350 * Convenience function for textmodel.TextHash#getItem with isRegex=true 351 */ 352 this.getItems = function(regex, contextLen, includeOnly, contextFilters, maxRandomHits, puncToExclude) { 353 return this.getItem(regex, contextLen, includeOnly, true, contextFilters, maxRandomHits, puncToExclude); 354 } 355 356 //this expects an index form 357 //return string of context around single hit 358 /** 359 * get a string of context around a single hit 360 * @param item the item key (not a full form) 361 * @param contextLen the length of the preceding and following context to include 362 * @param id the id of the hit to return 363 * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item (This should match the value in the original query) 364 * @returns string of context around the item with id, including the item itself 365 */ 366 this.getItemContext = function(item, contextLen, id, itemIsRegex) { 367 //item = convertItem(item); 368 var toGet = {}; 369 toGet[id] = true; 370 var results = this.getItem(item,contextLen,toGet, itemIsRegex); 371 372 var what = results[0][0].join(" ") + " " + results[1][0] + " " + results[2][0].join(" "); 373 return what; 374 } 375 376 //TBD ?? factor out sort case-insensitive sort, since it occurs twice 377 378 /** 379 * get the unique item keys in the model 380 * @returns a sorted (case insensitive) array of item keys 381 */ 382 this.getUniqItems = function() { 383 return Object.keys( this.indices ).sort(function(A,B){ 384 var a = A.toLocaleLowerCase(); 385 var b = B.toLocaleLowerCase(); 386 if (a<b) return -1; 387 if (a>b) return 1; 388 return 0; 389 }).filter(function(i){return i !== kRecDelim}); 390 } 391 392 //append \t and the count to the item 393 /** 394 * get the unique item keys in the model, each followed by tab and its token count 395 * @returns a sorted (case insensitive) array of item keys with their token counts 396 */ 397 this.getUniqItemsWithCounts = function() { 398 var what = []; 399 for(item in this.indices ) { 400 what.push(item + "\t" + this.indices[item].length); 401 } 402 403 what.sort(function(A,B){ 404 var a = A.toLocaleLowerCase(); 405 var b = B.toLocaleLowerCase(); 406 if (a<b) return -1; 407 if (a>b) return 1; 408 return 0; 409 }); 410 411 return what; 412 } 413 414 /** 415 * make this TextHash have the values of a (previously saved) TextHash JSON object 416 * @param obj the TextHash JSON object 417 */ 418 this.fromJSON = function(obj) { 419 420 this.baseField = obj.baseField; 421 this.baseFieldIdx = obj.baseFieldIdx; 422 this.items = obj.items; 423 this.itemObjs = obj.itemObjs; 424 this.indices = obj.indices; 425 this.numTypes = obj.numTypes; 426 this.numTokens = obj.numTokens; 427 this.useRecords = obj.useRecords; 428 } 429 430 //returns the index form of item, e.g. for when resetting predictive UI 431 /** 432 * convert a full item to its key form 433 * @param item the full item 434 * @returns the key form of the item 435 */ 436 this.itemToIndex = function(item) { 437 return convertItem(item); 438 } 439 440 /** @private */ 441 function convertItem(inItem) { 442 443 var flds = inItem.split(fldDelim); 444 var item = flds[that.baseFieldIdx]; 445 if (! caseSensitive) { 446 item = item.toLocaleLowerCase(); 447 } 448 449 for(var f=0,n=flds.length;f<n;f++) { 450 if (f == that.baseFieldIdx) { 451 continue; 452 } 453 if ( distinguishingFldsArray.indexOf( fldNames[f] ) > -1) { 454 item += ("" + fldDelim + flds[f]); 455 } 456 } 457 return item; 458 } 459 460 /** @private */ 461 function itemToObject(inItem, id) { //we need to define this early, since it gets used in the constructor 462 var flds = inItem.split(fldDelim); 463 var what = {}; 464 flds.forEach(function(fld, idx){ 465 what[ fldNames[idx] ] = fld; 466 }); 467 what[kIDKey] = id; 468 return what; 469 } 470 471 } 472 })(); 473