1 /***
2 Copyright (C) 2005 The Java Community
3
4 This program is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free Software
6 Foundation; either version 2 of the License, or (at your option) any later
7 version.
8
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 Place, Suite 330, Boston, MA 02111-1307 USA.
16 */
17 package org.bejug.javacareers.jobs.search.lucene;
18
19 /***
20 * Copyright 2002-2004 The Apache Software Foundation
21 *
22 * Licensed under the Apache License, Version 2.0 (the "License");
23 * you may not use this file except in compliance with the License.
24 * You may obtain a copy of the License at
25 *
26 * http://www.apache.org/licenses/LICENSE-2.0
27 *
28 * Unless required by applicable law or agreed to in writing, software
29 * distributed under the License is distributed on an "AS IS" BASIS,
30 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31 * See the License for the specific language governing permissions and
32 * limitations under the License.
33 */
34 import java.io.File;
35 import java.io.IOException;
36 import java.io.StringReader;
37 import java.util.ArrayList;
38 import java.util.List;
39
40 import org.apache.commons.logging.Log;
41 import org.apache.commons.logging.LogFactory;
42 import org.apache.lucene.analysis.Analyzer;
43 import org.apache.lucene.analysis.TokenStream;
44 import org.apache.lucene.analysis.standard.StandardAnalyzer;
45 import org.apache.lucene.document.Document;
46 import org.apache.lucene.index.IndexReader;
47 import org.apache.lucene.queryParser.ParseException;
48 import org.apache.lucene.queryParser.QueryParser;
49 import org.apache.lucene.search.Hits;
50 import org.apache.lucene.search.IndexSearcher;
51 import org.apache.lucene.search.Query;
52 import org.apache.lucene.search.Searcher;
53 import org.apache.lucene.search.highlight.Formatter;
54 import org.apache.lucene.search.highlight.Highlighter;
55 import org.apache.lucene.search.highlight.QueryScorer;
56 import org.apache.lucene.search.highlight.SimpleFragmenter;
57 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
58 import org.apache.lucene.search.highlight.TokenGroup;
59
60 /***
61 * Adapted from JUnit Test for Highlighter class by mark@searcharea.co.uk
62 *
63 * @author Bavo Bruylandt (Last modified by $Author: shally $)
64 * @version $Revision: 1.7 $ - $Date: 2005/12/20 15:36:46 $
65 */
66 public class ContextSearcher implements Formatter {
67
68 private static final Log LOG = LogFactory.getLog(ContextSearcher.class);
69 private static final String FIELD_NAME = "body";
70 private static final String PATH_NAME = "path";
71 private static final String USER_NAME = "user";
72
73 private static ContextSearcher contextSearch;
74
75 private IndexReader reader;
76 private Query query;
77 private Searcher searcher;
78 private Hits hits;
79 private String indexPath;
80
81 private Analyzer analyzer = new StandardAnalyzer();
82 private String endTag = "<b>";
83 private String startTag = "</b>";
84
85
86 /***
87 * Constructor for ContextSearcher.
88 *
89 * @param path Path to file
90 */
91 private ContextSearcher(String path) {
92 File indexPathFile = new File(path);
93 indexPath = indexPathFile.getAbsolutePath() + File.separator;
94 if (!indexPathFile.exists() || !indexPathFile.isDirectory()) {
95 LOG.info("Debug: IndexPath didnt exist: "+path+ " creating indexer first...");
96 try {
97 PdfIndexer.createPdfIndexer(path);
98 LOG.info("Debug: Indexer created");
99 } catch (PdfException e) {
100 LOG.error(e);
101 }
102 }
103 }
104
105 /***
106 *
107 * @param searchString String
108 * @param contextLength int
109 * @return List
110 * @throws PdfException if an error
111 */
112 public List getContext(String searchString, int contextLength)
113 throws PdfException {
114
115 if (contextLength < 0) {
116 throw new IllegalArgumentException("contextlength < 0");
117 }
118 List list = new ArrayList();
119
120 doSearching(searchString);
121 Formatter formatter = new SimpleHTMLFormatter(startTag,endTag);
122 Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));
123
124 highlighter.setTextFragmenter(new SimpleFragmenter(contextLength));
125 int maxNumFragmentsRequired = 2;
126 StringBuffer allresults = new StringBuffer();
127 LOG.info("Debug: Searching..."+searchString+" "+contextLength);
128 for (int i = 0; i < hits.length(); i++) {
129 String text = null;
130 String path = null;
131 String user = null;
132 Document doc = null;
133 double score = -1;
134 try {
135 doc = hits.doc(i);
136 text = doc.get(FIELD_NAME);
137 path = doc.get(PATH_NAME);
138 user = doc.get(USER_NAME);
139 score = hits.score(i);
140 } catch (IOException e) {
141 LOG.debug(e);
142 }
143
144 TokenStream tokenStream = analyzer.tokenStream(
145 FIELD_NAME, new StringReader(text));
146
147
148 String result = null;
149 try {
150 result = highlighter.getBestFragments(
151 tokenStream, text, maxNumFragmentsRequired, "...");
152
153 } catch (IOException e) {
154 LOG.debug(e);
155 }
156
157 allresults.append(result);
158 SearchResult searchResult = new SearchResultImpl();
159 searchResult.addContext(result);
160 searchResult.setFile(path);
161 searchResult.setQuery(query.toString());
162 searchResult.setWeight(score);
163 searchResult.setUser(user);
164 list.add(searchResult);
165 }
166 try {
167 if (reader != null) {
168 reader.close();
169 }
170 if (searcher != null) {
171 searcher.close();
172 }
173 } catch (IOException e) {
174 LOG.debug(e);
175 }
176 LOG.info("Debug: Serach list: "+list);
177 LOG.info("Debug: Found: "+list.size()+" items");
178 return list;
179 }
180
181 /***
182 *
183 * @param originalText String
184 * @param group TokenGroup
185 * @return text string
186 */
187 public String highlightTerm(String originalText, TokenGroup group) {
188 if (group.getTotalScore() <= 0) {
189 return originalText;
190 }
191
192 return "" + originalText + "";
193 }
194
195 /***
196 *
197 * @param queryString String
198 */
199 private void doSearching(final String queryString) {
200 try {
201 reader = IndexReader.open(indexPath);
202 String querieString = queryString.replaceAll("////","////");
203 querieString = querieString.replaceAll(":","////:");
204 querieString = "\""+querieString+"\"";
205 LOG.info("Debug: Query after adaption:" +querieString);
206 searcher = new IndexSearcher(indexPath);
207
208 querieString = USER_NAME+":"+querieString+" "+querieString;
209
210 query = QueryParser.parse(querieString, FIELD_NAME,
211 new StandardAnalyzer());
212
213
214
215
216
217
218 LOG.info("Debug: Reader: " + reader);
219 LOG.info("Debug: Query parsed: "+query.toString());
220 query = query.rewrite(reader);
221
222 LOG.info("Debug: Searching for: " +
223 query.toString(FIELD_NAME) + " in " + indexPath);
224
225 hits = searcher.search(query);
226 } catch (IOException e) {
227 LOG.debug(e);
228 } catch (ParseException e) {
229 LOG.debug(e);
230 }
231 }
232
233
234 /***
235 * @param startTag String
236 * @param endTag String
237 */
238 public void setHighlightTags(String startTag,String endTag) {
239 this.startTag = startTag;
240 this.endTag = endTag;
241 }
242
243 /***
244 *
245 * @param path String
246 * @return ContextSearcher
247 */
248 public static ContextSearcher createContextSearch(String path) {
249 if (contextSearch == null) {
250 contextSearch = new ContextSearcher(path);
251 }
252 return contextSearch;
253 }
254 }
255 /***
256 * $Log: ContextSearcher.java,v $
257 * Revision 1.7 2005/12/20 15:36:46 shally
258 * CheckStyle and PMD changes.
259 *
260 * Revision 1.6 2005/09/30 14:38:08 bavo_jcs
261 * Fixed URL
262 *
263 * Revision 1.5 2005/09/13 08:11:17 schauwvliege
264 * organize imports
265 *
266 * Revision 1.4 2005/09/06 13:25:29 schauwvliege
267 * fix
268 *
269 * Revision 1.3 2005/08/31 09:53:40 bavo_jcs
270 * Lucene delete fix
271 *
272 * Revision 1.2 2005/08/26 15:08:04 bavo_jcs
273 * Search fix
274 *
275 * Revision 1.1 2005/08/26 07:58:30 ge0ffrey
276 * split up the sources in service, serviceimpl and webclient
277 *
278 * Revision 1.7 2005/08/10 09:04:49 bavo_jcs
279 * Optimized imports according to checkstyle
280 *
281 * Revision 1.6 2005/08/09 12:59:55 bavo_jcs
282 * Optimized imports
283 *
284 * Revision 1.5 2005/07/12 10:42:39 bavo_jcs
285 * PDF Ajax integration, added user info
286 *
287 * Revision 1.4 2005/07/11 15:01:41 bavo_jcs
288 * PDF Ajax integration
289 *
290 * Revision 1.3 2005/06/14 12:05:53 schauwvliege
291 * CheckStyle and fixing tests
292 *
293 * Revision 1.2 2005/06/09 08:18:52 bejug_cc
294 * Fix initial import
295 *
296 * Revision 1.4 2005/06/07 14:38:48 bbr
297 * Lucene highlightterms added
298 *
299 * Revision 1.3 2005/06/07 09:46:13 bbr
300 * job source, webpath, indexing
301 *
302 * Revision 1.2 2005/05/23 17:33:53 sja
303 * Introduced File.separator.
304 *
305 * Revision 1.1 2005/05/23 17:09:48 sja
306 * Removed Lucene prefix.
307 *
308 * Revision 1.1 2005/05/23 15:42:00 bbr
309 * added weight to lucene
310 *
311 * Revision 1.1 2005/05/23 08:46:33 PSONG09
312 * added feeder source files to project
313 *
314 * Revision 1.3 2005/05/23 07:10:18 stephan_janssen
315 * Code cleanup.
316 *
317 * Revision 1.2 2005/05/20 07:45:29 bavo_jcs
318 * -lucene changes
319 *
320 * Revision 1.1 2005/05/18 15:46:39 bavo_jcs
321 * -adeed lucene service
322 *
323 */