/* (This is the new BSD license.)
 * Copyright (c) 2014, Chris Culy
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 * * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * * Neither the name of the Chris Culy nor the
 * names of its contributors may be used to endorse or promote
 * products from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY Chris Culy
 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy
 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

"use strict";
/**
 @namespace textmodel
 All of the functionality is in the textmodel namespace
*/
var textmodel = textmodel || {};

(function(){
    //split string on spaces (assume already tokenized/analyzed)
    /**
     * @class textmodel.RegExTextModel
     * This is the class for the RegExTextModel document model.
     * <p>
     * The model wraps a single string and answers regular-expression queries
     * against it, returning each hit together with whitespace-delimited context.
     *
     * @param string the text to search
     */
    textmodel.RegExTextModel = function(string) {
        var s = string;

        var aveItemLen = 20; //will use to look for context


        var initialWhitespace = /^\s/;
        var finalWhitespace = /\s$/;

        /**
         * Build a fresh RegExp from the source of <em>re</em>, copying only its
         * <em>i</em> and <em>m</em> flags and prepending <em>baseFlags</em>.
         * A fresh object is used so the caller's lastIndex is never touched.
         * @param re the expression supplied by the caller
         * @param baseFlags flags that are always set ("g" or "")
         * @returns a new RegExp
         */
        function copyRegex(re, baseFlags) {
            var flags = baseFlags;
            if (re.ignoreCase) {
                flags += "i";
            }
            if (re.multiline) {
                flags += "m";
            }
            return new RegExp(re.source, flags);
        }

        /**
         * Find every non-empty match of <em>re</em> in the text and return it with context.
         * When maxRandomHits is positive, the global <code>d3</code> (shuffle, range, permute)
         * is used to pick the random subset, so d3 must be loaded in that case.
         * @param re is the search expression. Only the <em>i</em> and <em>m</em> flags may be used. g may not.
         * @param contextLen the length (in whitespace-separated items) of the preceding and following context to be returned
         * @param maxRandomHits how many random hits to return. -1 or null to return all
         * @returns array of [array of prefixes, array of item, array of suffixes, array of ids],
         * where prefixes and suffixes come from splitting on whitespace, and the item is extended in both directions to the nearest whitespace.
         * Each id is the start offset of the (unexpanded) match in the text.
         */
        this.getItem = function(re, contextLen, maxRandomHits) {
            var prefixArray = [], itemArray = [], suffixArray = [], idArray = [];

            if (typeof(re.source) === 'undefined' || re.source === "") {
                return [prefixArray, itemArray, suffixArray, idArray];
            }

            var regex = copyRegex(re, "g");


            //based on https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec
            var results = [];
            var oneResult;
            while ((oneResult = regex.exec(s)) !== null) {
                if (oneResult[0] === "") {
                    //a zero-length match does not advance lastIndex, so without this
                    //step the loop would never terminate (e.g. /x*/ or new RegExp(""));
                    //empty matches carry no item, so they are skipped
                    regex.lastIndex++;
                    continue;
                }
                results.push({
                    'found' : oneResult[0],
                    start : oneResult.index,
                    end : regex.lastIndex //this is index just past the end of the match
                });
            }

            //select the number of random hits
            if (maxRandomHits !== null && maxRandomHits > 0) {
                //create an array 0..n, shuffle it, then take the first maxRandomHits
                var indices = d3.shuffle(d3.range(0,results.length)).slice(0,maxRandomHits);

                //now permute our original
                results = d3.permute(results, indices);
            }

            //now get the contexts and fill our arrays
            for (var i=0, n=results.length;i<n;i++) {
                var r = results[i];

                var hit = r.found;

                //if the hit is internal to an item, then we need to expand it to the nearest whitespace, in either direction
                var expandLeft = (! initialWhitespace.test(hit));
                if (expandLeft && initialWhitespace.test(s.substring(r.start-1,r.start))) {
                    expandLeft = false;
                }
                var expandRt = (! finalWhitespace.test(hit));
                if (expandRt && initialWhitespace.test(s.substring(r.end,r.end+1))) {
                    expandRt = false;
                }

                //one extra item of context is fetched when the hit will absorb a neighbouring piece
                var thisContextLen = contextLen;
                if (expandLeft || expandRt) {
                    thisContextLen += 1;
                }
                //character window estimated from the average item length
                var padding = (thisContextLen+2)*aveItemLen;

                var pieces = s.substring( r.start-padding-1, r.start ).split(/\s+/);
                if (pieces[pieces.length-1] === '') {
                    pieces.pop();
                }
                if (expandLeft && pieces.length > 0) {
                    hit = pieces.pop() + hit;
                }
                //slice(-0) is slice(0) and would return the whole window, so a
                //non-positive contextLen has to be handled explicitly
                prefixArray[i] = (contextLen > 0) ? pieces.slice(-contextLen) : [];

                pieces = s.substring( r.end, r.end+padding ).split(/\s+/);
                if (pieces[0] === '') {
                    pieces.shift();
                }
                if (expandRt && pieces.length > 0) {
                    hit += pieces.shift();
                }
                suffixArray[i] = pieces.slice(0,contextLen);


                itemArray[i] = hit;
                idArray[i] = r.start;

            }

            return [prefixArray, itemArray, suffixArray, idArray];
        };

        /**
         * Get a previously calculated item at the index along with some context on either side
         * @param re is the regular expression used previously by {@link textmodel.RegExTextModel#getItem} to calculate the index
         * @param index is an index (id) as calculated by {@link textmodel.RegExTextModel#getItem}; numeric strings are accepted
         * @param contextLen is the length (in characters) of the context
         * @returns an object {item:String, left:String, right:String}, or null if the expression
         * does not match (or matches only the empty string) in the text starting at index
         */
        this.getItemAndContext = function(re, index, contextLen) {
            index = Number(index);
            var what = {};
            var regex = copyRegex(re, "");

            var r = regex.exec(s.substr(index));

            //exec returns null when nothing matches; guard before reading r[0]
            if (r === null || ! r[0]) {
                return null; //error
            }

            what.item = r[0];
            var beginL = index-contextLen;
            var realLen = contextLen;
            if (index < contextLen) {
                //not enough text before the item: clip the left context to the start of the text
                beginL = 0;
                realLen = index;
            }
            what.left = s.substr(beginL, realLen);
            what.right = s.substr(index + what.item.length, contextLen);

            return what;
        };

    };
})();