1 /* (This is the new BSD license.)
  2 * Copyright (c) 2012-2014, Chris Culy
  3 * All rights reserved.
  4 *
  5 * Redistribution and use in source and binary forms, with or without
  6 * modification, are permitted provided that the following conditions are met:
  7 *     * Redistributions of source code must retain the above copyright
  8 *       notice, this list of conditions and the following disclaimer.
  9 *     * Redistributions in binary form must reproduce the above copyright
 10 *       notice, this list of conditions and the following disclaimer in the
 11 *       documentation and/or other materials provided with the distribution.
 12 *     * Neither the name of the Chris Culy nor the 
 13 *		names of its contributors may be used to endorse or promote 
 14 *		products from this software without specific prior written permission.
 15 *
 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy
 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy
 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 
 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 */
 27 
 28 "use strict";
 29 /**
 30  @namespace textmodel
 31  All of the functionality is in the textmodel namespace
 32 */
 33 var textmodel = textmodel || {};
 34 
 35 (function(){
 36 //split string on spaces (assume already tokenized/analyzed, and that caseSensitive is only for token, which is in first position)
 37  /**
 38   * @class textmodel.TextHash
 39   *  This is the class for the TextHash document model.
 40   * <p>
 41   *  It is used in the samples as a way of providing information to the {@doubletree.DoubleTree} to create its {@doubletree.Trie} data structure.
 42   *  It is not a <em>necessary</em> component of DoubleTree — all that really matters is that the relevant {@doubletree.Trie}s get created.
 43   *  For example, instead of TextHash, a database query might provide the input to {@doubletree.Trie}.
 44   *  <p>
 45   *  The TextHash maintains a hash of the data items where the keys are based on the distinguishing fields. The data parameters, except for baseField, 
 46   *  should be the same as those used in the DoubleTree that will visualize the data.
 47   *  <p>
 48   *  "Items" are types, which are associated with token ids. Some functions expect an full item as an argument. Others expect a key as an argument. {@link #itemToIndex} converts a full item to a key.
 49   *  <p>
 50   *  @property {number} numTypes The number of types (readOnly)
 51   *  @property {number} numTokens The number of tokens (readOnly)
 52   *
 53   * @param string the input string, where each item is separated by whitespace.
 54   * @param caseSensitive is the comparison of the baseField case sensitive or not
 55   * @param fldNames the names of the fields in the data items
 56   * @param fldDelim the field delimter in the data items. Note: it cannot be a whitespace (e.g. tab), since whitespace is used to delimit items
 57   * @param distinguishingFldsArray the fields that determine identity
 58   * @param baseField the primary field for comparison and display (typically token or lemma, but also possibly part of speech)
 59   * @param useRecords blank lines are treated as delimiting units (records) in the text. Default is false
 60   */
 61 textmodel.TextHash = function(string, caseSensitive, fldNames, fldDelim, distinguishingFldsArray, baseField, useRecords) {
 62     var that = this;
 63     var kRecDelim = String.fromCharCode(30);
 64     this.useRecords = useRecords && true;
 65     
 66     this.baseField = baseField;
 67     this.baseFieldIdx = fldNames.indexOf(baseField);
 68     if (this.baseFieldIdx == -1) {
 69 	this.baseFieldIdx = 0;
 70     }
 71     if (this.useRecords) {
 72         string = string.replace(/\s*\n\n+/g, " " + kRecDelim + " ");  //i.e. treat kRecDelim as a token
 73     }
 74     this.items = string.replace(/\s*\n\s*/g," ").trim().split(" "); //array of the items (tokens)
 75     
 76     this.itemObjs = []; //object representations of the items. For now duplicating items, but maybe we can combine them
 77     this.indices = {}; //for each type, a list of its token positions
 78     
 79     this.numTypes = 0; //filled in below
 80     this.numTokens = this.items.length;
 81     
 82     var numItems = this.items.length;
 83     var lastItem = numItems-1;
 84     if (string != "") { //string is empty if we will load from JSON
 85 	for(var i=0;i<numItems;i++) {
 86 	    
 87 	    var item = convertItem(this.items[i]);
 88          this.itemObjs[i] = itemToObject(item);
 89 	    
 90 	    if (! (item in this.indices && this.indices[item] instanceof Array)) { //we need the instanceof Array so we can override Object properties -- hope that none are already arrays
 91 		this.indices[item] = [];
 92 	    }
 93 	    try {
 94 		this.indices[item].push(i);
 95 	    } catch (e) {
 96 		console.log("Couldn't add: " + item);
 97 	    }
 98 	    
 99 	}
100     }
101     
102     this.numTypes = Object.keys( this.indices ).length;
103     
104     //this expects a full form
105     /**
106      * is the item in the model
107      * @param item a full item (not a key)
108      * @returns true if the item is in the model, false otherwise
109      */
110     this.containsItem = function(item) {
111 	return convertItem(item) in this.indices;
112     }
113     
114     //this expects an index form
115     /**
116      * is the item key in the model
117      * @param item a key (not a full item)
118      * @returns true if the item key is in the model, false otherwise
119      */
120     this.containsIndex = function(item) {
121 	return item in this.indices;
122     }
123        
124     //includeOnly is optional object with keys of lines *in this result* to include -- used for filtering after have results
125     //TBD: use (d3.)set instead of this dumb hash
126     
127     //return array of [array of prefixes, array of item, array ofsuffixes], where prefixes and suffixes are of length contextLen; exact match only for now
128     
129     //this expects an index form
130     /**
131      * get the information associated with an item
132      * <p>
133      * The information is an array of preceding items, an array of matching items, and an array of following items,
134      * where the preceding and following items are of length contextLen.
135      * @param item a key (not a full item)
136      * @param contextLen the length of the preceding and following context to be returned
137      * @param includeOnly an object where the keys are the item ids to be included (optional)
138      * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item
139      * @param contextFilters an object with "include" OR "exclude" which have objects whose keys are fields and whose values are arrays of values of those fields e.g. {"include":{"POS":["NN","NNS"]}} would include in the context only those items whose POS is NN or NNS.
140      * Similarly, "leftEnd" and "rtEnd" (both are possible together) indicate properties determining the left and right end points of the context, excluding those elements. So {"leftEnd":{"POS":["SENT"]}, "rtEnd":{"POS":["MD"]}} would include in the left context elements up to but not including the first SENT POS, and in the right context elements up to but not including the first MD POS.
141      * @param maxRandomHits how many random hits to return. -1 or null to return all
142      * @param puncToExclude a string of punctuation to exclude from the base field (will override any punctuation allowed via "include" in contextFilters). Default is null (i.e. include all punctuation)
143      * @returns array of [array of prefixes, array of item, array of suffixes, array of ids]
144      */
145     this.getItem = function(item, contextLen, includeOnly, itemIsRegex, contextFilters, maxRandomHits, puncToExclude) {
146         var prefixArray = [], itemArray = [], suffixArray = [], idArray = [];
147        
148        if (that.useRecords) {
149             var field1 = fldNames[0];
150             
151             if (contextFilters == null) {
152                 contextFilters = {};
153             }
154             if (typeof contextFilters["leftEnd"] === "undefined") {
155                 contextFilters["leftEnd"] = {};
156             }
157             var tmp = contextFilters["leftEnd"][field1];
158             if (typeof contextFilters["leftEnd"][field1] === "undefined") {
159                 contextFilters["leftEnd"][field1] = [];
160             }
161             contextFilters["leftEnd"][field1].push(kRecDelim);
162             
163             if (typeof contextFilters["rtEnd"] === "undefined") {
164                 contextFilters["rtEnd"] = {};
165             }
166             if (typeof contextFilters["rtEnd"][field1] === "undefined") {
167                 contextFilters["rtEnd"][field1] = [];
168             }
169             contextFilters["rtEnd"][field1].push(kRecDelim);
170             
171             
172        }
173        
174         var filterPunc = function(startIndex) {
175             if (!puncToExclude) {
176                 return function() {return true;}; //i.e. no filtering
177             }
178             
179             var re = new RegExp("^[" + puncToExclude + "]$");
180             
181             return function(elt,idx,contextArray) {
182                 return ! re.test(that.itemObjs[idx+startIndex][that.baseField]);
183             }
184         };
185         
186         var filterContext = function(startIndex){
187             return function() {return true;}
188         }; //i.e. no filtering
189         var chopLeftEnd = function(a) {
190             return a;
191         }; //i.e. no filtering
192          var chopRtEnd = function(a) {
193             return a;
194         }; //i.e. no filtering
195         
196         if (contextFilters) {            
197             if (contextFilters["include"]) {
198                 var toInclude = contextFilters["include"];
199                 var filterContext = function(startIndex) { //don't really need the var here, since have it above, but jsdoc thought it was global
200                     return function(elt,idx,contextArray) {
201                         for (var fld in toInclude) {
202                             if (toInclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
203                                 return true;
204                             }
205                         }
206                         return false;
207                     }
208                 };
209                 
210             } else if (contextFilters["exclude"]) {
211                 var toExclude = contextFilters["exclude"];
212                 filterContext = function(startIndex) {
213                     return function(elt,idx,contextArray) {
214                         for (var fld in toExclude) {
215                             //var tmp = that.itemObjs[idx+startIndex];
216                             if (toExclude[fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
217                                 return false;
218                             }
219                         }
220                         return true;
221                     }
222                 };
223             }
224         
225             if (contextFilters["leftEnd"]) {
226                 chopLeftEnd = function(a, startIndex) {
227                     var where = 0;
228                     var OK = true;
229                     for (var idx=a.length-1;idx>-1;idx--) {
230                         for (var fld in contextFilters["leftEnd"]) {
231                             if (contextFilters["leftEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
232                                 OK = false;
233                             }
234                         }
235                         if (!OK) {
236                             where = idx +1;
237                             break;
238                         }
239                     }
240                     return a.slice(where,a.length+1);
241                 }
242             }
243             
244            
245             if (contextFilters["rtEnd"]) {
246                 chopRtEnd = function(a, startIndex) {
247                     
248                     var OK = true;
249                     return a.filter(function(elt,idx) {
250                         if (!OK) {
251                             return false;
252                         }
253                         for (var fld in contextFilters["rtEnd"]) {
254                             if (contextFilters["rtEnd"][fld].indexOf(that.itemObjs[idx+startIndex][fld]) > -1) {
255                                 OK = false;
256                                 break;
257                             }
258                         }
259                         return OK;
260                     });
261                 }
262             }
263         }
264        
265         var numItems = this.items.length;
266         var lastItem = numItems-1;
267         
268         var hits;
269         if (itemIsRegex) {
270             var thisThat = this;
271             var re = new RegExp(item,'i'); //TBD 'i' is probably not right ...
272             var hitArrays = Object.keys(this.indices).filter(function(item){
273              return re.test(item);
274               })
275             .map(function(hit) {
276              return thisThat.indices[hit];
277             })
278             //.flatten(); //TBD use this if we can get it from a library, like Underscore.js
279             
280             hits = [];
281             hitArrays.forEach(function(arr){
282              arr.forEach(function(elt){
283                  hits.push(elt);    
284              });
285             });
286             
287             
288             thisThat = null;
289         } else {
290             hits = this.indices[item];
291         }
292         
293         
294         
295         var n = hits.length;
296         var nMinus1 = n - 1;
297 	
298         for (var i = 0; i < n; i++) {
299             if (includeOnly != null && ! (i in includeOnly) ) {
300                 continue;
301             }
302 
303             idArray.push(i);
304             var thisIndex = hits[i];
305             var origItem = this.items[thisIndex];
306             itemArray.push( origItem );
307 	    
308             if (thisIndex == 0) {
309                 prefixArray.push( [] );
310             } else {
311                 var pStart = Math.max(0, thisIndex - contextLen); //start of prefix
312                 var prefs = chopLeftEnd(this.items.slice(pStart, thisIndex), pStart);
313                 pStart += contextLen - prefs.length;
314                 prefixArray.push( prefs.filter(function(elt,idx,contextArray) {
315                     return filterPunc(pStart)(elt,idx,contextArray) && filterContext(pStart)(elt,idx,contextArray);
316                 }) );
317             }
318 
319             if (thisIndex == lastItem) {
320                 suffixArray.push( [] );
321             } else {
322                 var sEnd = Math.min(lastItem, thisIndex + contextLen);
323                 var sStart = thisIndex + 1;
324                 var suffs = chopRtEnd(this.items.slice(sStart, sEnd + 1), sStart);
325                 suffixArray.push( suffs.filter(function(elt,idx,contextArray) {
326                     return filterPunc(sStart)(elt,idx,contextArray) && filterContext(sStart)(elt,idx,contextArray);
327                 }) );
328             }
329         }
330         
331         //select the number of random hits
332         if (maxRandomHits !== null && maxRandomHits > 0) {
333             //create an array 0..n, shuffle it, then take the first maxRandomHits
334             var indices = d3.shuffle(d3.range(0,itemArray.length)).slice(0,maxRandomHits);
335             
336             //now permute our originals
337             prefixArray = d3.permute(prefixArray, indices);
338             itemArray = d3.permute(itemArray, indices);
339             suffixArray = d3.permute(suffixArray, indices);
340             idArray = d3.permute(idArray, indices);
341         }
342         
343         return [prefixArray, itemArray, suffixArray, idArray];
344     }
345     
346     //this expects a regular expression to match against index forms
347     /**
348      * Convenience function for textmodel.TextHash#getItem with isRegex=true
349      */
350     this.getItems = function(regex, contextLen, includeOnly, contextFilters, maxRandomHits, puncToExclude) {
351         return this.getItem(regex, contextLen, includeOnly, true, contextFilters, maxRandomHits, puncToExclude);
352     }
353     
354     //this expects an index form
355     //return string of context around single hit
356     /**
357      * get a string of context around a single hit
358      * @param item the item key (not a full form)
359      * @param contextLen the length of the preceding and following context to include
360      * @param id the id of the hit to return
361      * @param itemIsRegex true if the item parameter should be considered as a regular expression instead of as a true item (This should match the value in the original query)
362      * @returns string of context around the item with id, including the item itself
363      */
364     this.getItemContext = function(item, contextLen, id, itemIsRegex) {
365 	//item = convertItem(item);
366         var toGet = {};
367 	toGet[id] = true;
368         var results = this.getItem(item,contextLen,toGet, itemIsRegex);
369         
370         var what = results[0][0].join(" ") + " " + results[1][0] + " " + results[2][0].join(" ");
371         return what;
372     }
373     
374     //TBD ?? factor out sort case-insensitive sort, since it occurs twice
375     
376     /**
377      * get the unique item keys in the model
378      * @returns a sorted (case insensitive) array of item keys
379      */
380     this.getUniqItems = function() {
381         return Object.keys( this.indices ).sort(function(A,B){
382             var a = A.toLocaleLowerCase();
383             var b = B.toLocaleLowerCase();
384             if (a<b) return -1;
385             if (a>b) return 1;
386             return 0;
387         }).filter(function(i){return i !== kRecDelim});
388     }
389     
390     //append \t and the count to the item
391     /**
392      * get the unique item keys in the model, each followed by tab and its token count
393      * @returns a sorted (case insensitive) array of item keys with their token counts
394      */
395     this.getUniqItemsWithCounts = function() {
396         var what = [];
397         for(item in this.indices ) {
398             what.push(item + "\t" + this.indices[item].length);
399         }
400         
401         what.sort(function(A,B){
402             var a = A.toLocaleLowerCase();
403             var b = B.toLocaleLowerCase();
404             if (a<b) return -1;
405             if (a>b) return 1;
406             return 0;
407         });
408         
409         return what;
410     }
411     
412     /**
413      * make this TextHash have the values of a (previously saved) TextHash JSON object
414      * @param obj the TextHash JSON object
415      */
416     this.fromJSON = function(obj) {
417 	
418 	this.baseField = obj.baseField;
419 	this.baseFieldIdx = obj.baseFieldIdx;
420 	this.items = obj.items;
421 	this.indices = obj.indices;
422 	this.numTypes = obj.numTypes;
423 	this.numTokens = obj.numTokens;
424      this.useRecords = obj.useRecords;
425     }
426     
427     //returns the index form of item, e.g. for when resetting predictive UI
428     /**
429      * convert a full item to its key form
430      * @param item the full item
431      * @returns the key form of the item
432      */
433     this.itemToIndex = function(item) {
434 	return convertItem(item);
435     }
436     
437     /** @private */
438     function convertItem(inItem) {
439 	
440 	var flds = inItem.split(fldDelim);
441 	var item = flds[that.baseFieldIdx];
442 	if (! caseSensitive) {
443 	    item = item.toLocaleLowerCase();
444 	}
445 	
446         for(var f=0,n=flds.length;f<n;f++) {
447             if (f == that.baseFieldIdx) {
448                 continue;
449             }
450             if ( distinguishingFldsArray.indexOf( fldNames[f] ) > -1) {
451                 item += ("" + fldDelim + flds[f]);
452             }
453         }
454 	return item;
455     }
456     
457      /** @private */
458     function itemToObject(inItem) { //we need to define this early, since it gets used in the constructor
459         var flds = inItem.split(fldDelim);
460         var what = {};
461         flds.forEach(function(fld, idx){
462             what[ fldNames[idx] ] = fld;
463         });
464         return what;
465     }
466     
467 }
468 })();
469