1 /* (This is the new BSD license.) 2 * Copyright (c) 2014, Chris Culy 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * * Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * * Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * * Neither the name of the Chris Culy nor the 13 * names of its contributors may be used to endorse or promote 14 * products from this software without specific prior written permission. 15 * 16 * THIS SOFTWARE IS PROVIDED BY Chris Culy 17 * ``AS IS'' AND ANY OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 18 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19 * ARE DISCLAIMED. IN NO EVENT SHALL Chris Culy 20 * BE LIABLE FOR ANY, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 22 * GOODS OR SERVICES; OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 * CAUSED AND ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR 24 * TORT INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 25 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 "use strict"; 29 /** 30 @namespace textmodel 31 All of the functionality is in the textmodel namespace 32 */ 33 var textmodel = textmodel || {}; 34 35 (function(){ 36 //split string on spaces (assume already tokenized/analyzed) 37 /** 38 * @class textmodel.RegExTextModel 39 * This is the class for the RegExTextModel document model. 40 * <p> 41 * 42 */ 43 textmodel.RegExTextModel = function(string) { 44 var that = this; 45 var s = string; 46 47 var aveItemLen = 20; //will use to look for context 48 49 50 var initialWhitespace = /^\s/; 51 var finalWhitespace = /\s$/; 52 var puncMarker = "߷"; //NKO SYMBOL GBAKURUNEN U+07F7 placeholder for string of punctuation marks 53 54 55 /** 56 * @param re is the search expression. Only the <em>i</em> and <em>m</em> flags may be used. g may not. 57 * @param contextLen the length of the preceding and following context to be returned 58 * @param maxRandomHits how many random hits to return. -1 or null to return all 59 * @puncToExclude is the punctuation (as a string) to exclude when searching: any instances of puncMarker in re will be replaced by puncToExclude 60 * @returns array of [array of prefixes, array of item, array of suffixes, array of ids], 61 * where prefixes and suffixes come from splitting on whitespace, and the item is extended in both directions to the nearest whitespace 62 */ 63 this.getItem = function(re, contextLen, maxRandomHits, puncToExclude) { 64 var prefixArray = [], itemArray = [], suffixArray = [], idArray = []; 65 66 if (typeof(re.source) === 'undefined' || re.source === "") { 67 return [prefixArray, itemArray, suffixArray, idArray]; 68 } 69 70 var flags = "g"; 71 if (re.ignoreCase) { 72 flags += "i"; 73 } 74 if (re.multiline) { 75 flags += "m"; 76 } 77 78 79 var rePunc = new RegExp(puncMarker, "g"); 80 var reSrc = re.source.replace(rePunc, puncToExclude); 81 82 var regex = new RegExp(reSrc, flags); 83 84 85 //based on https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/exec 86 var results = []; 87 var oneResult = []; 88 while ((oneResult = regex.exec(s)) !== null) { 89 var result = {'found' : oneResult[0]}; 90 result.start = oneResult.index; 91 result.end = regex.lastIndex; //this is index just past the end of the match 92 results.push(result); 93 } 94 95 //select the number of random hits 96 if (maxRandomHits !== null && maxRandomHits > 0) { 97 //create an array 0..n, shuffle it, then take the first maxRandomHits 98 var indices = d3.shuffle(d3.range(0,results.length)).slice(0,maxRandomHits); 99 100 //now permute our original 101 results = d3.permute(results, indices); 102 } 103 104 //now get the contexts and fill our arrays 105 106 107 for (var i=0, n=results.length;i<n;i++) { 108 var r = results[i]; 109 110 var hit = r.found; 111 112 //if the hit is internal to an item, then we need to expand it to the nearest whitespace, in either direction 113 114 var expandLeft = (! initialWhitespace.test(hit)); 115 if (expandLeft && initialWhitespace.test(s.substring(r.start-1,r.start))) { 116 expandLeft = false; 117 } 118 var expandRt = (! finalWhitespace.test(hit)) 119 if (expandRt && initialWhitespace.test(s.substring(r.end,r.end+1))) { 120 expandRt = false; 121 } 122 123 124 var thisContextLen = contextLen; 125 if (expandLeft || expandRt) { 126 thisContextLen += 1; 127 } 128 var padding = (thisContextLen+2)*aveItemLen; 129 130 var pieces = s.substring( r.start-padding-1, r.start ).split(/\s+/); 131 if (pieces[pieces.length-1] === '') { 132 pieces.pop(); 133 } 134 if (expandLeft && pieces.length > 0) { 135 hit = pieces.pop() + hit; 136 } 137 prefixArray[i] = pieces.slice(-contextLen); 138 139 pieces = s.substring( r.end, r.end+padding ).split(/\s+/); 140 if (pieces[0] === '') { 141 pieces.shift(); 142 } 143 if (expandRt && pieces.length > 0) { 144 hit += pieces.shift(); 145 } 146 suffixArray[i] = pieces.slice(0,contextLen); 147 148 149 itemArray[i] = hit; 150 idArray[i] = r.start; 151 152 } 153 154 return [prefixArray, itemArray, suffixArray, idArray]; 155 } 156 157 /** 158 * Get a previously calculated item at the index along with some context on either side 159 * @param re is a the regular expression used previsousy by {@link textmodel.RegExTextModel.KWIC#getItem} to calculate the index 160 * @param index is an index as calculated by {@link textmodel.RegExTextModel.KWIC#getItem} 161 * @param contextLen is the length (in characters) of the context 162 * @returns an object {item:String, left:String, right:String} 163 */ 164 this.getItemAndContext = function(re, index, contextLen) { 165 index = 1*index; 166 var what = {}; 167 var flags = ""; 168 if (re.ignoreCase) { 169 flags += "i"; 170 } 171 if (re.multiline) { 172 flags += "m"; 173 } 174 var regex = new RegExp(re.source, flags); 175 176 var r = regex.exec(s.substr(index)); 177 178 179 180 if (! r[0]) { 181 return null; //error 182 } 183 184 what.item = r[0]; 185 var beginL = index-contextLen; 186 var realLen = contextLen; 187 if (index < contextLen) { 188 beginL = 0; 189 realLen = index; 190 } 191 what.left = s.substr(beginL, realLen); 192 what.right = s.substr(index + what.item.length, contextLen) 193 194 return what; 195 } 196 197 /** 198 * Getter/setter for the punctuation symbol 199 * @param val is the symbol to use as a placeholder for a stirng of punctuation symbols. See {@link textmodel.RegExTextModel.KWIC#getItem}. 200 */ 201 this.puncMarker = function(val) { 202 if (arguments.length === 0) { 203 return puncMaker; 204 } 205 puncMarker = val; 206 return this; 207 } 208 209 } 210 })(); 211