1 /* (This is the new BSD license.)
  2 * Copyright (c) 2014, Chris Culy
  3 * All rights reserved.
  4 *
  5 * Redistribution and use in source and binary forms, with or without
  6 * modification, are permitted provided that the following conditions are met:
  7 *     * Redistributions of source code must retain the above copyright
  8 *       notice, this list of conditions and the following disclaimer.
  9 *     * Redistributions in binary form must reproduce the above copyright
 10 *       notice, this list of conditions and the following disclaimer in the
 11 *       documentation and/or other materials provided with the distribution.
 12 *     * Neither the name of the Chris Culy nor the 
 13 *		names of its contributors may be used to endorse or promote 
 14 *		products derived from this software without specific prior written permission.
 15 *
 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy
 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 
 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy
 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 
 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 
 22 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 23 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 
 24 * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 
 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 26 */
 27 
 28 "use strict";
 29 /**
 30  @namespace textmodel
 31  All of the functionality is in the textmodel namespace
 32 */
 33 var textmodel = textmodel || {};
 34 
 35 (function(){
 36 //split string on spaces (assume already tokenized/analyzed)
 37  /**
 38   * @class textmodel.RegExTextModel
 39   *  This is the class for the RegExTextModel document model.
 40   * <p>
 41   *  
 42   */
 43 textmodel.RegExTextModel = function(string) {
 44     var that = this;
 45     var s = string;
 46     
 47     var aveItemLen = 20; //will use to look for context
 48     
 49     
 50     var initialWhitespace = /^\s/;
 51     var finalWhitespace = /\s$/;
 52     var puncMarker = "߷"; //NKO SYMBOL GBAKURUNEN U+07F7    placeholder for string of punctuation marks
 53     
 54     
 55     /**
 56      * @param re is the search expression. Only the <em>i</em> and <em>m</em> flags may be used. g may not.
 57      * @param contextLen the length of the preceding and following context to be returned
 58      * @param maxRandomHits how many random hits to return. -1 or null to return all
 59      * @puncToExclude is the punctuation (as a string) to exclude when searching: any instances of puncMarker in re will be replaced by puncToExclude
 60      * @returns array of [array of prefixes, array of item, array of suffixes, array of ids],
 61      * where prefixes and suffixes come from splitting on whitespace, and the item is extended in both directions to the nearest whitespace
 62      */
 63     this.getItem = function(re, contextLen, maxRandomHits, puncToExclude) {
 64         var prefixArray = [], itemArray = [], suffixArray = [], idArray = [];
 65        
 66         if (typeof(re.source) === 'undefined' || re.source === "") {
 67             return [prefixArray, itemArray, suffixArray, idArray];
 68         }
 69        
 70         var flags = "g";
 71         if (re.ignoreCase) {
 72             flags += "i";
 73         }
 74         if (re.multiline) {
 75             flags += "m";
 76         }
 77         
 78        
 79         var rePunc = new RegExp(puncMarker, "g");
 80         var reSrc = re.source.replace(rePunc, puncToExclude);
 81         
 82         var regex = new RegExp(reSrc, flags);
 83         
 84         
 85         //based on https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
 86         var results = [];
 87         var oneResult = [];
 88         while ((oneResult = regex.exec(s)) !== null) {
 89             var result = {'found' : oneResult[0]};
 90             result.start = oneResult.index;
 91             result.end = regex.lastIndex; //this is index just past the end of the match
 92             results.push(result);
 93         }
 94         
 95         //select the number of random hits
 96         if (maxRandomHits !== null && maxRandomHits > 0) {
 97             //create an array 0..n, shuffle it, then take the first maxRandomHits
 98             var indices = d3.shuffle(d3.range(0,results.length)).slice(0,maxRandomHits);
 99             
100             //now permute our original
101             results = d3.permute(results, indices);
102         }
103         
104         //now get the contexts and fill our arrays
105         
106         
107         for (var i=0, n=results.length;i<n;i++) {
108             var r = results[i];
109             
110             var hit = r.found;
111             
112             //if the hit is internal to an item, then we need to expand it to the nearest whitespace, in either direction
113             
114             var expandLeft = (! initialWhitespace.test(hit));
115             if (expandLeft && initialWhitespace.test(s.substring(r.start-1,r.start))) {
116                 expandLeft = false;
117             }
118             var expandRt = (! finalWhitespace.test(hit))
119             if (expandRt && initialWhitespace.test(s.substring(r.end,r.end+1))) {
120                 expandRt = false;
121             }
122             
123             
124             var thisContextLen = contextLen;
125             if (expandLeft || expandRt) {
126                 thisContextLen += 1;
127             }
128             var padding = (thisContextLen+2)*aveItemLen;
129             
130             var pieces = s.substring( r.start-padding-1, r.start ).split(/\s+/);
131             if (pieces[pieces.length-1] === '') {
132                 pieces.pop();
133             }
134             if (expandLeft && pieces.length > 0) {
135               hit = pieces.pop() + hit;
136             }
137             prefixArray[i] = pieces.slice(-contextLen);
138             
139             pieces = s.substring( r.end, r.end+padding ).split(/\s+/);
140             if (pieces[0] === '') {
141                 pieces.shift();
142             }
143             if (expandRt && pieces.length > 0) {
144               hit += pieces.shift();
145             }
146             suffixArray[i] = pieces.slice(0,contextLen);
147             
148             
149             itemArray[i] = hit;
150             idArray[i] = r.start;
151             
152         }
153         
154         return [prefixArray, itemArray, suffixArray, idArray];
155     }
156     
157     /**
158      * Get a previously calculated item at the index along with some context on either side
159      * @param re is a the regular expression used previsousy by {@link textmodel.RegExTextModel.KWIC#getItem} to calculate the index
160      * @param index is an index as calculated by {@link textmodel.RegExTextModel.KWIC#getItem}
161      * @param contextLen is the length (in characters) of the context
162      * @returns an object  {item:String, left:String, right:String}
163      */
164     this.getItemAndContext = function(re, index, contextLen) {
165         index = 1*index;
166         var what = {};
167         var flags = "";
168         if (re.ignoreCase) {
169             flags += "i";
170         }
171         if (re.multiline) {
172             flags += "m";
173         }
174         var regex = new RegExp(re.source, flags);
175         
176         var r = regex.exec(s.substr(index));
177         
178         
179         
180         if (! r[0]) {
181             return null; //error
182         }
183         
184         what.item = r[0];
185         var beginL = index-contextLen;
186         var realLen = contextLen;
187         if (index < contextLen) {
188             beginL = 0;
189             realLen = index;
190         }
191         what.left = s.substr(beginL, realLen);
192         what.right = s.substr(index + what.item.length, contextLen)
193         
194         return what;
195     }
196     
197     /**
198      * Getter/setter for the punctuation symbol
199      * @param val is the symbol to use as a placeholder for a stirng of punctuation symbols. See {@link textmodel.RegExTextModel.KWIC#getItem}.
200      */
201     this.puncMarker = function(val) {
202         if (arguments.length === 0) {
203             return puncMaker;
204         }
205         puncMarker = val;
206         return this;
207     }
208     
209 }
210 })();
211