1 /* (This is the new BSD license.) 2 * Copyright (c) 2012-2014, Chris Culy 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the Chris Culy nor the 13 * names of its contributors may be used to endorse or promote 14 * products from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 "use strict"; 29 /** 30 @namespace textmodel 31 All of the functionality is in the textmodel namespace 32 */ 33 var textmodel = textmodel || {}; 34 35 (function(){ 36 //split string on spaces (assume already tokenized/analyzed, and that caseSensitive is only for token, which is in first position) 37 /** 38 * @class textmodel.TextHash 39 * This is the class for the TextHash document model. 40 * <p> 41 * It is used in the samples as a way of providing information to the {@doubletree.DoubleTree} to create its {@doubletree.Trie} data structure. 42 * It is not a <em>necessary</em> component of DoubleTree — all that really matters is that the relevant {@doubletree.Trie}s get created. 43 * For example, instead of TextHash, a database query might provide the input to {@doubletree.Trie}. 44 * <p> 45 * The TextHash maintains a hash of the data items where the keys are based on the distinguishing fields. The data parameters, except for baseField, 46 * should be the same as those used in the DoubleTree that will visualize the data. 47 * <p> 48 * "Items" are types, which are associated with token ids. Some functions expect an full item as an argument. Others expect a key as an argument. {@link #itemToIndex} converts a full item to a key. 49 * <p> 50 * @property {number} numTypes The number of types (readOnly) 51 * @property {number} numTokens The number of tokens (readOnly) 52 * 53 * @param string the input string, where each item is separated by whitespace. 54 * @param caseSensitive is the comparison of the baseField case sensitive or not 55 * @param fldNames the names of the fields in the data items 56 * @param fldDelim the field delimter in the data items. Note: it cannot be a whitespace (e.g. tab), since whitespace is used to delimit items 57 * @param distinguishingFldsArray the fields that determine identity 58 * @param baseField the primary field for comparison and display (typically token or lemma, but also possibly part of speech) 59 * @param useRecords blank lines are treated as delimiting units (records) in the text. Default is false 60 */ 61 textmodel.TextHash = function(string, caseSensitive, fldNames, fldDelim, distinguishingFldsArray, baseField, useRecords) { 62 var that = this; 63 var kRecDelim = String.fromCharCode(30); 64 this.useRecords = useRecords && true; 65 66 this.baseField = baseField; 67 this.baseFieldIdx = fldNames.indexOf(baseField); 68 if (this.baseFieldIdx == -1) { 69 this.baseFieldIdx = 0; 70 } 71 if (this.useRecords) { 72 string = string.replace(/\s*\n\n+/g, " " + kRecDelim + " "); //i.e. treat kRecDelim as a token 73 } 74 this.items = string.replace(/\s*\n\s*/g," ").trim().split(" "); //array of the items (tokens) 75 76 this.itemObjs = []; //object representations of the items. For now duplicating items, but maybe we can combine them 77 this.indices = {}; //for each type, a list of its token positions 78 79 this.numTypes = 0; //filled in below 80 this.numTokens = this.items.length; 81 82 var numItems = this.items.length; 83 var lastItem = numItems-1; 84 if (string != "") { //string is empty if we will load from JSON 85 for(var i=0;i<numItems;i++) { 86 87 var item = convertItem(this.items[i]); 88 this.itemObjs[i] = itemToObject(item); 89 90 if (! (item in this.indices && this.indices[item] instanceof Array)) { //we need the instanceof Array so we can override Object properties -- hope that none are already arrays 91 this.indices[item] = []; 92 } 93 try { 94 this.indices[item].push(i); 95 } catch (e) { 96 console.log("Couldn't add: " + item); 97 } 98 99 } 100 } 101 102 this.numTypes = Object.keys( this.indices ).length; 103 104 //this expects a full form 105 /** 106 * is the item in the model 107 * @param item a full item (not a key) 108 * @returns true if the item is in the model, false otherwise 109 */ 110 this.containsItem = function(item) { 111 return convertItem(item) in this.indices; 112 } 113 114 //this expects an index form 115 /** 116 * is the item key in the model 117 * @param item a key (not a full item) 118 * @returns true if the item key is in the model, false otherwise 119 */ 120 this.containsIndex = function(item) { 121 return item in this.indices; 122 } 123 124 //includeOnly is optional object with keys of lines *in this result* to include -- used for filtering after have results 125 //TBD: use (d3.)set instead of this dumb hash 126 127 //return array of [array of prefixes, array of item, array ofsuffixes], where prefixes and suffixes are of length contextLen; exact match only for now 128 129 //this expects an index form 130 /** 131 * get the information associated with an item 132 * <p> 133 * The information is an array of preceding items, an array of matching items, and an array of following items, 134 * where the preceding and following items are of length contextLen. 135 * @param item a key (not a full item) 136 * @param contextLen the length of the preceding and following context to be returned 137 * @param includeOnly an object where the keys are the item ids to be included (optional) 138 * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item 139 * @param contextFilters an object with "include" OR "exclude" which have objects whose keys are fields and whose values are arrays of values of those fields e.g. {"include":{"POS":["NN","NNS"]}} would include in the context only those items whose POS is NN or NNS. 140 * Similarly, "leftEnd" and "rtEnd" (both are possible together) indicate properties determining the left and right end points of the context, excluding those elements. So {"leftEnd":{"POS":["SENT"]}, "rtEnd":{"POS":["MD"]}} would include in the left context elements up to but not including the first SENT POS, and in the right context elements up to but not including the first MD POS. 141 * @param maxRandomHits how many random hits to return. -1 or null to return all 142 * @param puncToExclude a string of punctuation to exclude from the base field (will override any punctuation allowed via "include" in contextFilters). Default is null (i.e. include all punctuation) 143 * @returns array of [array of prefixes, array of item, array of suffixes, array of ids] 144 */ 145 this.getItem = function(item, contextLen, includeOnly, itemIsRegex, contextFilters, maxRandomHits, puncToExclude) { 146 var prefixArray = [], itemArray = [], suffixArray = [], idArray = []; 147 148 if (that.useRecords) { 149 var field1 = fldNames[0]; 150 151 if (contextFilters == null) { 152 contextFilters = {}; 153 } 154 if (typeof contextFilters["leftEnd"] === "undefined") { 155 contextFilters["leftEnd"] = {}; 156 } 157 var tmp = contextFilters["leftEnd"][field1]; 158 if (typeof contextFilters["leftEnd"][field1] === "undefined") { 159 contextFilters["leftEnd"][field1] = []; 160 } 161 contextFilters["leftEnd"][field1].push(kRecDelim); 162 163 if (typeof contextFilters["rtEnd"] === "undefined") { 164 contextFilters["rtEnd"] = {}; 165 } 166 if (typeof contextFilters["rtEnd"][field1] === "undefined") { 167 contextFilters["rtEnd"][field1] = []; 168 } 169 contextFilters["rtEnd"][field1].push(kRecDelim); 170 171 172 } 173 174 var filterPunc = function(startIndex) { 175 if (!puncToExclude) { 176 return function() {return true;}; //i.e. no filtering 177 } 178 179 var re = new RegExp("^[" + puncToExclude + "]$"); 180 181 return function(elt,idx,contextArray) { 182 return ! re.test(that.itemObjs[idx+startIndex][that.baseField]); 183 } 184 }; 185 186 var filterContext = function(startIndex){ 187 return function() {return true;} 188 }; //i.e. no filtering 189 var chopLeftEnd = function(a) { 190 return a; 191 }; //i.e. no filtering 192 var chopRtEnd = function(a) { 193 return a; 194 }; //i.e. no filtering 195 196 if (contextFilters) { 197 if (contextFilters["include"]) { 198 var toInclude = contextFilters["include"]; 199 var filterContext = function(startIndex) { //don't really need the var here, since have it above, but jsdoc thought it was global 200 return function(elt,idx,contextArray) { 201 for (var fld in toInclude) { 202 if (toInclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 203 return true; 204 } 205 } 206 return false; 207 } 208 }; 209 210 } else if (contextFilters["exclude"]) { 211 var toExclude = contextFilters["exclude"]; 212 filterContext = function(startIndex) { 213 return function(elt,idx,contextArray) { 214 for (var fld in toExclude) { 215 //var tmp = that.itemObjs[idx+startIndex]; 216 if (toExclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 217 return false; 218 } 219 } 220 return true; 221 } 222 }; 223 } 224 225 if (contextFilters["leftEnd"]) { 226 chopLeftEnd = function(a, startIndex) { 227 var where = 0; 228 var OK = true; 229 for (var idx=a.length-1;idx>-1;idx--) { 230 for (var fld in contextFilters["leftEnd"]) { 231 if (contextFilters["leftEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 232 OK = false; 233 } 234 } 235 if (!OK) { 236 where = idx +1; 237 break; 238 } 239 } 240 return a.slice(where,a.length+1); 241 } 242 } 243 244 245 if (contextFilters["rtEnd"]) { 246 chopRtEnd = function(a, startIndex) { 247 248 var OK = true; 249 return a.filter(function(elt,idx) { 250 if (!OK) { 251 return false; 252 } 253 for (var fld in contextFilters["rtEnd"]) { 254 if (contextFilters["rtEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) { 255 OK = false; 256 break; 257 } 258 } 259 return OK; 260 }); 261 } 262 } 263 } 264 265 var numItems = this.items.length; 266 var lastItem = numItems-1; 267 268 var hits; 269 if (itemIsRegex) { 270 var thisThat = this; 271 var re = new RegExp(item,'i'); //TBD 'i' is probably not right ... 272 var hitArrays = Object.keys(this.indices).filter(function(item){ 273 return re.test(item); 274 }) 275 .map(function(hit) { 276 return thisThat.indices[hit]; 277 }) 278 //.flatten(); //TBD use this if we can get it from a library, like Underscore.js 279 280 hits = []; 281 hitArrays.forEach(function(arr){ 282 arr.forEach(function(elt){ 283 hits.push(elt); 284 }); 285 }); 286 287 288 thisThat = null; 289 } else { 290 hits = this.indices[item]; 291 } 292 293 294 295 var n = hits.length; 296 var nMinus1 = n - 1; 297 298 for (var i = 0; i < n; i++) { 299 if (includeOnly != null && ! (i in includeOnly) ) { 300 continue; 301 } 302 303 idArray.push(i); 304 var thisIndex = hits[i]; 305 var origItem = this.items[thisIndex]; 306 itemArray.push( origItem ); 307 308 if (thisIndex == 0) { 309 prefixArray.push( [] ); 310 } else { 311 var pStart = Math.max(0, thisIndex - contextLen); //start of prefix 312 var prefs = chopLeftEnd(this.items.slice(pStart, thisIndex), pStart); 313 pStart += contextLen - prefs.length; 314 prefixArray.push( prefs.filter(function(elt,idx,contextArray) { 315 return filterPunc(pStart)(elt,idx,contextArray) && filterContext(pStart)(elt,idx,contextArray); 316 }) ); 317 } 318 319 if (thisIndex == lastItem) { 320 suffixArray.push( [] ); 321 } else { 322 var sEnd = Math.min(lastItem, thisIndex + contextLen); 323 var sStart = thisIndex + 1; 324 var suffs = chopRtEnd(this.items.slice(sStart, sEnd + 1), sStart); 325 suffixArray.push( suffs.filter(function(elt,idx,contextArray) { 326 return filterPunc(sStart)(elt,idx,contextArray) && filterContext(sStart)(elt,idx,contextArray); 327 }) ); 328 } 329 } 330 331 //select the number of random hits 332 if (maxRandomHits !== null && maxRandomHits > 0) { 333 //create an array 0..n, shuffle it, then take the first maxRandomHits 334 var indices = d3.shuffle(d3.range(0,itemArray.length)).slice(0,maxRandomHits); 335 336 //now permute our originals 337 prefixArray = d3.permute(prefixArray, indices); 338 itemArray = d3.permute(itemArray, indices); 339 suffixArray = d3.permute(suffixArray, indices); 340 idArray = d3.permute(idArray, indices); 341 } 342 343 return [prefixArray, itemArray, suffixArray, idArray]; 344 } 345 346 //this expects a regular expression to match against index forms 347 /** 348 * Convenience function for textmodel.TextHash#getItem with isRegex=true 349 */ 350 this.getItems = function(regex, contextLen, includeOnly, contextFilters, maxRandomHits, puncToExclude) { 351 return this.getItem(regex, contextLen, includeOnly, true, contextFilters, maxRandomHits, puncToExclude); 352 } 353 354 //this expects an index form 355 //return string of context around single hit 356 /** 357 * get a string of context around a single hit 358 * @param item the item key (not a full form) 359 * @param contextLen the length of the preceding and following context to include 360 * @param id the id of the hit to return 361 * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item (This should match the value in the original query) 362 * @returns string of context around the item with id, including the item itself 363 */ 364 this.getItemContext = function(item, contextLen, id, itemIsRegex) { 365 //item = convertItem(item); 366 var toGet = {}; 367 toGet[id] = true; 368 var results = this.getItem(item,contextLen,toGet, itemIsRegex); 369 370 var what = results[0][0].join(" ") + " " + results[1][0] + " " + results[2][0].join(" "); 371 return what; 372 } 373 374 //TBD ?? factor out sort case-insensitive sort, since it occurs twice 375 376 /** 377 * get the unique item keys in the model 378 * @returns a sorted (case insensitive) array of item keys 379 */ 380 this.getUniqItems = function() { 381 return Object.keys( this.indices ).sort(function(A,B){ 382 var a = A.toLocaleLowerCase(); 383 var b = B.toLocaleLowerCase(); 384 if (a<b) return -1; 385 if (a>b) return 1; 386 return 0; 387 }).filter(function(i){return i !== kRecDelim}); 388 } 389 390 //append \t and the count to the item 391 /** 392 * get the unique item keys in the model, each followed by tab and its token count 393 * @returns a sorted (case insensitive) array of item keys with their token counts 394 */ 395 this.getUniqItemsWithCounts = function() { 396 var what = []; 397 for(item in this.indices ) { 398 what.push(item + "\t" + this.indices[item].length); 399 } 400 401 what.sort(function(A,B){ 402 var a = A.toLocaleLowerCase(); 403 var b = B.toLocaleLowerCase(); 404 if (a<b) return -1; 405 if (a>b) return 1; 406 return 0; 407 }); 408 409 return what; 410 } 411 412 /** 413 * make this TextHash have the values of a (previously saved) TextHash JSON object 414 * @param obj the TextHash JSON object 415 */ 416 this.fromJSON = function(obj) { 417 418 this.baseField = obj.baseField; 419 this.baseFieldIdx = obj.baseFieldIdx; 420 this.items = obj.items; 421 this.indices = obj.indices; 422 this.numTypes = obj.numTypes; 423 this.numTokens = obj.numTokens; 424 this.useRecords = obj.useRecords; 425 } 426 427 //returns the index form of item, e.g. for when resetting predictive UI 428 /** 429 * convert a full item to its key form 430 * @param item the full item 431 * @returns the key form of the item 432 */ 433 this.itemToIndex = function(item) { 434 return convertItem(item); 435 } 436 437 /** @private */ 438 function convertItem(inItem) { 439 440 var flds = inItem.split(fldDelim); 441 var item = flds[that.baseFieldIdx]; 442 if (! caseSensitive) { 443 item = item.toLocaleLowerCase(); 444 } 445 446 for(var f=0,n=flds.length;f<n;f++) { 447 if (f == that.baseFieldIdx) { 448 continue; 449 } 450 if ( distinguishingFldsArray.indexOf( fldNames[f] ) > -1) { 451 item += ("" + fldDelim + flds[f]); 452 } 453 } 454 return item; 455 } 456 457 /** @private */ 458 function itemToObject(inItem) { //we need to define this early, since it gets used in the constructor 459 var flds = inItem.split(fldDelim); 460 var what = {}; 461 flds.forEach(function(fld, idx){ 462 what[ fldNames[idx] ] = fld; 463 }); 464 return what; 465 } 466 467 } 468 })(); 469