1 /* (This is the new BSD license.)
  2 * Copyright (c) 2012-2014, Chris Culy
  3 * All rights reserved.
  4 *
  5 * Redistribution and use in source and binary forms, with or without
  6 * modification, are permitted provided that the following conditions are met:
  7 *     * Redistributions of source code must retain the above copyright
  8 *       notice, this list of conditions and the following disclaimer.
  9 *     * Redistributions in binary form must reproduce the above copyright
 10 *       notice, this list of conditions and the following disclaimer in the
 11 *       documentation and/or other materials provided with the distribution.
 12 *     * Neither the name of the Chris Culy nor the 
 13 *		names of its contributors may be used to endorse or promote 
 14 *		products from this software without specific prior written permission.
 15 *
 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy
 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy
 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 
 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 */
 27 
 28 "use strict";
 29 /**
 30  @namespace textmodel
 31  All of the functionality is in the textmodel namespace
 32 */
 33 var textmodel = textmodel || {};
 34 
 35 (function(){
 36 //split string on spaces (assume already tokenized/analyzed, and that caseSensitive is only for token, which is in first position)
 37  /**
 38   * @class textmodel.TextHash
 39   *  This is the class for the TextHash document model.
 40   * <p>
 41   *  It is used in the samples as a way of providing information to the {@doubletree.DoubleTree} to create its {@doubletree.Trie} data structure.
 42   *  It is not a <em>necessary</em> component of DoubleTree — all that really matters is that the relevant {@doubletree.Trie}s get created.
 43   *  For example, instead of TextHash, a database query might provide the input to {@doubletree.Trie}.
 44   *  <p>
 45   *  The TextHash maintains a hash of the data items where the keys are based on the distinguishing fields. The data parameters, except for baseField, 
 46   *  should be the same as those used in the DoubleTree that will visualize the data.
 47   *  <p>
 48   *  "Items" are types, which are associated with token ids. Some functions expect an full item as an argument. Others expect a key as an argument. {@link #itemToIndex} converts a full item to a key.
 49   *  <p>
 50   *  @property {number} numTypes The number of types (readOnly)
 51   *  @property {number} numTokens The number of tokens (readOnly)
 52   *
 53   * @param string the input string, where each item is separated by whitespace.
 54   * @param caseSensitive is the comparison of the baseField case sensitive or not
 55   * @param fldNames the names of the fields in the data items
 56   * @param fldDelim the field delimter in the data items. Note: it cannot be a whitespace (e.g. tab), since whitespace is used to delimit items
 57   * @param distinguishingFldsArray the fields that determine identity
 58   * @param baseField the primary field for comparison and display (typically token or lemma, but also possibly part of speech)
 59   * @param useRecords blank lines are treated as delimiting units (records) in the text. Default is false
 60   */
 61 textmodel.TextHash = function(string, caseSensitive, fldNames, fldDelim, distinguishingFldsArray, baseField, useRecords) {
 62     var that = this;
 63     var kIDKey = "id" + fldDelim;
 64     var kRecDelim = String.fromCharCode(30);
 65     this.useRecords = useRecords && true;
 66     
 67     this.baseField = baseField;
 68     this.baseFieldIdx = fldNames.indexOf(baseField);
 69     if (this.baseFieldIdx == -1) {
 70 	this.baseFieldIdx = 0;
 71     }
 72     if (this.useRecords) {
 73         string = string.replace(/\s*\n\n+/g, " " + kRecDelim + " ");  //i.e. treat kRecDelim as a token
 74     }
 75     this.items = string.replace(/\s*\n\s*/g," ").trim().split(" "); //array of the items (tokens)
 76     
 77     this.itemObjs = []; //object representations of the items. For now duplicating items, but maybe we can combine them
 78     this.indices = {}; //for each type, a list of its token positions
 79     
 80     this.numTypes = 0; //filled in below
 81     this.numTokens = this.items.length;
 82     
 83     var numItems = this.items.length;
 84     var lastItem = numItems-1;
 85     if (string != "") { //string is empty if we will load from JSON
 86 	for(var i=0;i<numItems;i++) {
 87 	    
 88 	    var item = convertItem(this.items[i]);
 89          this.itemObjs[i] = itemToObject(item, i);
 90 	    
 91 	    if (! (item in this.indices && this.indices[item] instanceof Array)) { //we need the instanceof Array so we can override Object properties -- hope that none are already arrays
 92 		this.indices[item] = [];
 93 	    }
 94 	    try {
 95 		this.indices[item].push(i);
 96 	    } catch (e) {
 97 		console.log("Couldn't add: " + item);
 98 	    }
 99 	    
100 	}
101     }
102     
103     this.numTypes = Object.keys( this.indices ).length;
104     
105     //this expects a full form
106     /**
107      * is the item in the model
108      * @param item a full item (not a key)
109      * @returns true if the item is in the model, false otherwise
110      */
111     this.containsItem = function(item) {
112 	return convertItem(item) in this.indices;
113     }
114     
115     //this expects an index form
116     /**
117      * is the item key in the model
118      * @param item a key (not a full item)
119      * @returns true if the item key is in the model, false otherwise
120      */
121     this.containsIndex = function(item) {
122 	return item in this.indices;
123     }
124        
125     //includeOnly is optional object with keys of lines *in this result* to include -- used for filtering after have results
126     //TBD: use (d3.)set instead of this dumb hash
127     
128     //return array of [array of prefixes, array of item, array ofsuffixes], where prefixes and suffixes are of length contextLen; exact match only for now
129     
130     //this expects an index form
131     /**
132      * get the information associated with an item
133      * <p>
134      * The information is an array of preceding items, an array of matching items, and an array of following items,
135      * where the preceding and following items are of length contextLen.
136      * @param item a key (not a full item)
137      * @param contextLen the length of the preceding and following context to be returned
138      * @param includeOnly an object where the keys are the item ids to be included (optional)
139      * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item
140      * @param contextFilters an object with "include" OR "exclude" which have objects whose keys are fields and whose values are arrays of values of those fields e.g. {"include":{"POS":["NN","NNS"]}} would include in the context only those items whose POS is NN or NNS.
141      * Similarly, "leftEnd" and "rtEnd" (both are possible together) indicate properties determining the left and right end points of the context, excluding those elements. So {"leftEnd":{"POS":["SENT"]}, "rtEnd":{"POS":["MD"]}} would include in the left context elements up to but not including the first SENT POS, and in the right context elements up to but not including the first MD POS.
142      * @param maxRandomHits how many random hits to return. -1 or null to return all
143      * @param puncToExclude a string of punctuation to exclude from the base field (will override any punctuation allowed via "include" in contextFilters). Default is null (i.e. include all punctuation)
144      * @returns array of [array of prefixes, array of item, array of suffixes, array of ids]
145      */
146     this.getItem = function(item, contextLen, includeOnly, itemIsRegex, contextFilters, maxRandomHits, puncToExclude) {
147         var prefixArray = [], itemArray = [], suffixArray = [], idArray = [];
148        
149        if (that.useRecords) {
150             var field1 = fldNames[0];
151             
152             if (contextFilters == null) {
153                 contextFilters = {};
154             }
155             if (typeof contextFilters["leftEnd"] === "undefined") {
156                 contextFilters["leftEnd"] = {};
157             }
158             var tmp = contextFilters["leftEnd"][field1];
159             if (typeof contextFilters["leftEnd"][field1] === "undefined") {
160                 contextFilters["leftEnd"][field1] = [];
161             }
162             contextFilters["leftEnd"][field1].push(kRecDelim);
163             
164             if (typeof contextFilters["rtEnd"] === "undefined") {
165                 contextFilters["rtEnd"] = {};
166             }
167             if (typeof contextFilters["rtEnd"][field1] === "undefined") {
168                 contextFilters["rtEnd"][field1] = [];
169             }
170             contextFilters["rtEnd"][field1].push(kRecDelim);
171             
172             
173        }
174        
175         var filterPunc = function(startIndex) {
176             if (!puncToExclude) {
177                 return function() {return true;}; //i.e. no filtering
178             }
179             
180             var re = new RegExp("^[" + puncToExclude + "]$");
181             
182             return function(elt,idx,contextArray) {
183                 return ! re.test(that.itemObjs[idx+startIndex][that.baseField]);
184             }
185         };
186         
187         var filterContext = function(startIndex){
188             return function() {return true;}
189         }; //i.e. no filtering
190         var chopLeftEnd = function(a) {
191             return a;
192         }; //i.e. no filtering
193          var chopRtEnd = function(a) {
194             return a;
195         }; //i.e. no filtering
196         
197         if (contextFilters) {            
198             if (contextFilters["include"]) {
199                 var toInclude = contextFilters["include"];
200                 var filterContext = function(startIndex) { //don't really need the var here, since have it above, but jsdoc thought it was global
201                     return function(elt,idx,contextArray) {
202                         for (var fld in toInclude) {
203                             if (toInclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
204                                 return true;
205                             }
206                         }
207                         return false;
208                     }
209                 };
210                 
211             } else if (contextFilters["exclude"]) {
212                 var toExclude = contextFilters["exclude"];
213                 filterContext = function(startIndex) {
214                     return function(elt,idx,contextArray) {
215                         for (var fld in toExclude) {
216                             //var tmp = that.itemObjs[idx+startIndex];
217                             if (toExclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
218                                 return false;
219                             }
220                         }
221                         return true;
222                     }
223                 };
224             }
225         
226             if (contextFilters["leftEnd"]) {
227                 chopLeftEnd = function(a, startIndex) {
228                     var where = 0;
229                     var OK = true;
230                     for (var idx=a.length-1;idx>-1;idx--) {
231                         for (var fld in contextFilters["leftEnd"]) {
232                             if (contextFilters["leftEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
233                                 OK = false;
234                             }
235                         }
236                         if (!OK) {
237                             where = idx +1;
238                             break;
239                         }
240                     }
241                     return a.slice(where,a.length+1);
242                 }
243             }
244             
245            
246             if (contextFilters["rtEnd"]) {
247                 chopRtEnd = function(a, startIndex) {
248                     
249                     var OK = true;
250                     return a.filter(function(elt,idx) {
251                         if (!OK) {
252                             return false;
253                         }
254                         for (var fld in contextFilters["rtEnd"]) {
255                             if (contextFilters["rtEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
256                                 OK = false;
257                                 break;
258                             }
259                         }
260                         return OK;
261                     });
262                 }
263             }
264         }
265        
266         var numItems = this.items.length;
267         var lastItem = numItems-1;
268         
269         var hits;
270         if (itemIsRegex) {
271             var thisThat = this;
272             var re = new RegExp(item,'i'); //TBD 'i' is probably not right ...
273             var hitArrays = Object.keys(this.indices).filter(function(item){
274              return re.test(item);
275               })
276             .map(function(hit) {
277              return thisThat.indices[hit];
278             })
279             //.flatten(); //TBD use this if we can get it from a library, like Underscore.js
280             
281             hits = [];
282             hitArrays.forEach(function(arr){
283              arr.forEach(function(elt){
284                  hits.push(elt);    
285              });
286             });
287             
288             
289             thisThat = null;
290         } else {
291             hits = this.indices[item];
292         }
293         
294         
295         
296         var n = hits.length;
297         var nMinus1 = n - 1;
298 	
299         for (var i = 0; i < n; i++) {
300              var thisIndex = hits[i];
301             var origItem = this.items[thisIndex];
302             var thisID = this.itemObjs[thisIndex][kIDKey];
303             if (includeOnly != null && ! (thisID in includeOnly) ) {
304                 continue;
305             }
306            
307             itemArray.push( origItem );
308             idArray.push(thisID);
309 	    
310             if (thisIndex == 0) {
311                 prefixArray.push( [] );
312             } else {
313                 var pStart = Math.max(0, thisIndex - contextLen); //start of prefix
314                 var prefs = chopLeftEnd(this.items.slice(pStart, thisIndex), pStart);
315                 pStart += contextLen - prefs.length;
316                 prefixArray.push( prefs.filter(function(elt,idx,contextArray) {
317                     return filterPunc(pStart)(elt,idx,contextArray) && filterContext(pStart)(elt,idx,contextArray);
318                 }) );
319             }
320 
321             if (thisIndex == lastItem) {
322                 suffixArray.push( [] );
323             } else {
324                 var sEnd = Math.min(lastItem, thisIndex + contextLen);
325                 var sStart = thisIndex + 1;
326                 var suffs = chopRtEnd(this.items.slice(sStart, sEnd + 1), sStart);
327                 suffixArray.push( suffs.filter(function(elt,idx,contextArray) {
328                     return filterPunc(sStart)(elt,idx,contextArray) && filterContext(sStart)(elt,idx,contextArray);
329                 }) );
330             }
331         }
332         
333         //select the number of random hits
334         if (maxRandomHits !== null && maxRandomHits > 0) {
335             //create an array 0..n, shuffle it, then take the first maxRandomHits
336             var indices = d3.shuffle(d3.range(0,itemArray.length)).slice(0,maxRandomHits);
337             
338             //now permute our originals
339             prefixArray = d3.permute(prefixArray, indices);
340             itemArray = d3.permute(itemArray, indices);
341             suffixArray = d3.permute(suffixArray, indices);
342             idArray = d3.permute(idArray, indices);
343         }
344         
345         return [prefixArray, itemArray, suffixArray, idArray];
346     }
347     
348     //this expects a regular expression to match against index forms
349     /**
350      * Convenience function for textmodel.TextHash#getItem with isRegex=true
351      */
352     this.getItems = function(regex, contextLen, includeOnly, contextFilters, maxRandomHits, puncToExclude) {
353         return this.getItem(regex, contextLen, includeOnly, true, contextFilters, maxRandomHits, puncToExclude);
354     }
355     
356     //this expects an index form
357     //return string of context around single hit
358     /**
359      * get a string of context around a single hit
360      * @param item the item key (not a full form)
361      * @param contextLen the length of the preceding and following context to include
362      * @param id the id of the hit to return
363      * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item (This should match the value in the original query)
364      * @returns string of context around the item with id, including the item itself
365      */
366     this.getItemContext = function(item, contextLen, id, itemIsRegex) {
367 	//item = convertItem(item);
368         var toGet = {};
369 	toGet[id] = true;
370         var results = this.getItem(item,contextLen,toGet, itemIsRegex);
371         
372         var what = results[0][0].join(" ") + " " + results[1][0] + " " + results[2][0].join(" ");
373         return what;
374     }
375     
376     //TBD ?? factor out sort case-insensitive sort, since it occurs twice
377     
378     /**
379      * get the unique item keys in the model
380      * @returns a sorted (case insensitive) array of item keys
381      */
382     this.getUniqItems = function() {
383         return Object.keys( this.indices ).sort(function(A,B){
384             var a = A.toLocaleLowerCase();
385             var b = B.toLocaleLowerCase();
386             if (a<b) return -1;
387             if (a>b) return 1;
388             return 0;
389         }).filter(function(i){return i !== kRecDelim});
390     }
391     
392     //append \t and the count to the item
393     /**
394      * get the unique item keys in the model, each followed by tab and its token count
395      * @returns a sorted (case insensitive) array of item keys with their token counts
396      */
397     this.getUniqItemsWithCounts = function() {
398         var what = [];
399         for(item in this.indices ) {
400             what.push(item + "\t" + this.indices[item].length);
401         }
402         
403         what.sort(function(A,B){
404             var a = A.toLocaleLowerCase();
405             var b = B.toLocaleLowerCase();
406             if (a<b) return -1;
407             if (a>b) return 1;
408             return 0;
409         });
410         
411         return what;
412     }
413     
414     /**
415      * make this TextHash have the values of a (previously saved) TextHash JSON object
416      * @param obj the TextHash JSON object
417      */
418     this.fromJSON = function(obj) {
419 	
420 	this.baseField = obj.baseField;
421 	this.baseFieldIdx = obj.baseFieldIdx;
422 	this.items = obj.items;
423      this.itemObjs = obj.itemObjs;
424 	this.indices = obj.indices;
425 	this.numTypes = obj.numTypes;
426 	this.numTokens = obj.numTokens;
427      this.useRecords = obj.useRecords;
428     }
429     
430     //returns the index form of item, e.g. for when resetting predictive UI
431     /**
432      * convert a full item to its key form
433      * @param item the full item
434      * @returns the key form of the item
435      */
436     this.itemToIndex = function(item) {
437 	return convertItem(item);
438     }
439     
440     /** @private */
441     function convertItem(inItem) {
442 	
443 	var flds = inItem.split(fldDelim);
444 	var item = flds[that.baseFieldIdx];
445 	if (! caseSensitive) {
446 	    item = item.toLocaleLowerCase();
447 	}
448 	
449         for(var f=0,n=flds.length;f<n;f++) {
450             if (f == that.baseFieldIdx) {
451                 continue;
452             }
453             if ( distinguishingFldsArray.indexOf( fldNames[f] ) > -1) {
454                 item += ("" + fldDelim + flds[f]);
455             }
456         }
457 	return item;
458     }
459     
460      /** @private */
461     function itemToObject(inItem, id) { //we need to define this early, since it gets used in the constructor
462         var flds = inItem.split(fldDelim);
463         var what = {};
464         flds.forEach(function(fld, idx){
465             what[ fldNames[idx] ] = fld;
466         });
467         what[kIDKey] = id;
468         return what;
469     }
470     
471 }
472 })();
473