1 /* (This is the new BSD license.)
  2 * Copyright (c) 2014, Chris Culy
  3 * All rights reserved.
  4 *
  5 * Redistribution and use in source and binary forms, with or without
  6 * modification, are permitted provided that the following conditions are met:
  7 *     * Redistributions of source code must retain the above copyright
  8 *       notice, this list of conditions and the following disclaimer.
  9 *     * Redistributions in binary form must reproduce the above copyright
 10 *       notice, this list of conditions and the following disclaimer in the
 11 *       documentation and/or other materials provided with the distribution.
 12 *     * Neither the name of the Chris Culy nor the 
 13 *		names of its contributors may be used to endorse or promote 
 14 *		products from this software without specific prior written permission.
 15 *
 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy
 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy
 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 
 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 */
 27 
 28 "use strict";
 29 /**
 30  @namespace textmodel
 31  All of the functionality is in the textmodel namespace
 32 */
 33 var textmodel = textmodel || {};
 34 
 35 (function(){
 36 //split string on spaces (assume already tokenized/analyzed)
 37  /**
 38   * @class textmodel.RegExTextModel
 39   *  This is the class for the RegExTextModel document model.
 40   * <p>
 41   *  
 42   */
 43 textmodel.RegExTextModel = function(string) {
 44     var that = this;
 45     var s = string;
 46     
 47     var aveItemLen = 20; //will use to look for context
 48     
 49     
 50     var initialWhitespace = /^\s/;
 51     var finalWhitespace = /\s$/;
 52     
 53     /**
 54      * @param re is the search expression. Only the <em>i</em> and <em>m</em> flags may be used. g may not.
 55      * @param contextLen the length of the preceding and following context to be returned
 56      * @param maxRandomHits how many random hits to return. -1 or null to return all
 57      * @returns array of [array of prefixes, array of item, array of suffixes, array of ids],
 58      * where prefixes and suffixes come from splitting on whitespace, and the item is extended in both directions to the nearest whitespace
 59      */
 60     this.getItem = function(re, contextLen, maxRandomHits) {
 61         var prefixArray = [], itemArray = [], suffixArray = [], idArray = [];
 62        
 63         if (typeof(re.source) === 'undefined' || re.source === "") {
 64             return [prefixArray, itemArray, suffixArray, idArray];
 65         }
 66        
 67         var flags = "g";
 68         if (re.ignoreCase) {
 69             flags += "i";
 70         }
 71         if (re.multiline) {
 72             flags += "m";
 73         }
 74         var regex = new RegExp(re.source, flags);
 75         
 76         
 77         //based on https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
 78         var results = [];
 79         var oneResult = [];
 80         while ((oneResult = regex.exec(s)) !== null) {
 81             var result = {'found' : oneResult[0]};
 82             result.start = oneResult.index;
 83             result.end = regex.lastIndex; //this is index just past the end of the match
 84             results.push(result);
 85         }
 86         
 87         //select the number of random hits
 88         if (maxRandomHits !== null && maxRandomHits > 0) {
 89             //create an array 0..n, shuffle it, then take the first maxRandomHits
 90             var indices = d3.shuffle(d3.range(0,results.length)).slice(0,maxRandomHits);
 91             
 92             //now permute our original
 93             results = d3.permute(results, indices);
 94         }
 95         
 96         //now get the contexts and fill our arrays
 97         
 98         
 99         for (var i=0, n=results.length;i<n;i++) {
100             var r = results[i];
101             
102             var hit = r.found;
103             
104             //if the hit is internal to an item, then we need to expand it to the nearest whitespace, in either direction
105             
106             var expandLeft = (! initialWhitespace.test(hit));
107             if (expandLeft && initialWhitespace.test(s.substring(r.start-1,r.start))) {
108                 expandLeft = false;
109             }
110             var expandRt = (! finalWhitespace.test(hit))
111             if (expandRt && initialWhitespace.test(s.substring(r.end,r.end+1))) {
112                 expandRt = false;
113             }
114             
115             
116             var thisContextLen = contextLen;
117             if (expandLeft || expandRt) {
118                 thisContextLen += 1;
119             }
120             var padding = (thisContextLen+2)*aveItemLen;
121             
122             var pieces = s.substring( r.start-padding-1, r.start ).split(/\s+/);
123             if (pieces[pieces.length-1] === '') {
124                 pieces.pop();
125             }
126             if (expandLeft && pieces.length > 0) {
127               hit = pieces.pop() + hit;
128             }
129             prefixArray[i] = pieces.slice(-contextLen);
130             
131             pieces = s.substring( r.end, r.end+padding ).split(/\s+/);
132             if (pieces[0] === '') {
133                 pieces.shift();
134             }
135             if (expandRt && pieces.length > 0) {
136               hit += pieces.shift();
137             }
138             suffixArray[i] = pieces.slice(0,contextLen);
139             
140             
141             itemArray[i] = hit;
142             idArray[i] = r.start;
143             
144         }
145         
146         return [prefixArray, itemArray, suffixArray, idArray];
147     }
148     
149     /**
150      * Get a previously calculated item at the index along with some context on either side
151      * @param re is a the regular expression used previsousy by {@link textmodel.RegExTextModel.KWIC#getItem} to calculate the index
152      * @param index is an index as calculated by {@link textmodel.RegExTextModel.KWIC#getItem}
153      * @param contextLen is the length (in characters) of the context
154      * @returns an object  {item:String, left:String, right:String}
155      */
156     this.getItemAndContext = function(re, index, contextLen) {
157         index = 1*index;
158         var what = {};
159         var flags = "";
160         if (re.ignoreCase) {
161             flags += "i";
162         }
163         if (re.multiline) {
164             flags += "m";
165         }
166         var regex = new RegExp(re.source, flags);
167         
168         var r = regex.exec(s.substr(index));
169         
170         
171         
172         if (! r[0]) {
173             return null; //error
174         }
175         
176         what.item = r[0];
177         var beginL = index-contextLen;
178         var realLen = contextLen;
179         if (index < contextLen) {
180             beginL = 0;
181             realLen = index;
182         }
183         what.left = s.substr(beginL, realLen);
184         what.right = s.substr(index + what.item.length, contextLen)
185         
186         return what;
187     }
188     
189 }
190 })();
191