View Javadoc

1   /***
2    Copyright (C) 2005 The Java Community
3   
4    This program is free software; you can redistribute it and/or modify  it under
5    the terms of the GNU General Public License as published by  the Free Software
6    Foundation; either version 2 of the License, or  (at your option) any later
7    version.
8   
9    This program is distributed in the hope that it will be useful,  but WITHOUT
10   ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS
11   FOR A PARTICULAR PURPOSE. See the  GNU General Public License for more details.
12  
13   You should have received a copy of the GNU General Public License  along with
14   this program; if not, write to the Free Software  Foundation, Inc., 59 Temple
15   Place, Suite 330, Boston, MA 02111-1307 USA.
16   */
17  package org.bejug.javacareers.jobs.search.lucene;
18  
19  /***
20   * Copyright 2002-2004 The Apache Software Foundation
21   *
22   * Licensed under the Apache License, Version 2.0 (the "License");
23   * you may not use this file except in compliance with the License.
24   * You may obtain a copy of the License at
25   *
26   *     http://www.apache.org/licenses/LICENSE-2.0
27   *
28   * Unless required by applicable law or agreed to in writing, software
29   * distributed under the License is distributed on an "AS IS" BASIS,
30   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
31   * See the License for the specific language governing permissions and
32   * limitations under the License.
33   */
34  import java.io.File;
35  import java.io.IOException;
36  import java.io.StringReader;
37  import java.util.ArrayList;
38  import java.util.List;
39  
40  import org.apache.commons.logging.Log;
41  import org.apache.commons.logging.LogFactory;
42  import org.apache.lucene.analysis.Analyzer;
43  import org.apache.lucene.analysis.TokenStream;
44  import org.apache.lucene.analysis.standard.StandardAnalyzer;
45  import org.apache.lucene.document.Document;
46  import org.apache.lucene.index.IndexReader;
47  import org.apache.lucene.queryParser.ParseException;
48  import org.apache.lucene.queryParser.QueryParser;
49  import org.apache.lucene.search.Hits;
50  import org.apache.lucene.search.IndexSearcher;
51  import org.apache.lucene.search.Query;
52  import org.apache.lucene.search.Searcher;
53  import org.apache.lucene.search.highlight.Formatter;
54  import org.apache.lucene.search.highlight.Highlighter;
55  import org.apache.lucene.search.highlight.QueryScorer;
56  import org.apache.lucene.search.highlight.SimpleFragmenter;
57  import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
58  import org.apache.lucene.search.highlight.TokenGroup;
59  
60  /***
61   * Adapted from JUnit Test for Highlighter class by mark@searcharea.co.uk
62   *
63   * @author Bavo Bruylandt (Last modified by $Author: shally $)
64   * @version $Revision: 1.7 $ - $Date: 2005/12/20 15:36:46 $
65   */
66  public class ContextSearcher implements Formatter {
67  
68      private static final Log LOG = LogFactory.getLog(ContextSearcher.class);
69      private static final String FIELD_NAME = "body";
70      private static final String PATH_NAME = "path";
71      private static final String USER_NAME = "user";
72  
73      private static ContextSearcher contextSearch;
74  
75      private IndexReader reader;
76      private Query query;
77      private Searcher searcher;
78      private Hits hits;
79      private String indexPath;
80  
81      private Analyzer analyzer = new StandardAnalyzer();
82      private String endTag = "<b>";
83      private String startTag = "</b>";
84  
85  
86      /***
87       * Constructor for ContextSearcher.
88       *
89       * @param path Path to file
90       */
91      private ContextSearcher(String path) {
92          File indexPathFile = new File(path);
93          indexPath = indexPathFile.getAbsolutePath() + File.separator;
94          if (!indexPathFile.exists() || !indexPathFile.isDirectory()) {
95              LOG.info("Debug: IndexPath didnt exist: "+path+ " creating indexer first...");
96              try {
97                  PdfIndexer.createPdfIndexer(path);
98                  LOG.info("Debug: Indexer created");
99              } catch (PdfException e) {
100                 LOG.error(e);
101             }
102         }
103     }
104 
105     /***
106      *
107      * @param searchString String
108      * @param contextLength int
109      * @return List
110      * @throws PdfException if an error
111      */
112     public List getContext(String searchString, int contextLength)
113             throws PdfException {
114 
115         if (contextLength < 0) {
116             throw new IllegalArgumentException("contextlength < 0");
117         }
118         List list = new ArrayList();
119 
120         doSearching(searchString);
121         Formatter formatter = new SimpleHTMLFormatter(startTag,endTag);
122         Highlighter highlighter = new Highlighter(formatter,new QueryScorer(query));
123 
124         highlighter.setTextFragmenter(new SimpleFragmenter(contextLength));
125         int maxNumFragmentsRequired = 2;
126         StringBuffer allresults = new StringBuffer();
127         LOG.info("Debug: Searching..."+searchString+" "+contextLength);
128         for (int i = 0; i < hits.length(); i++) {
129             String text = null;
130             String path = null;
131             String user = null;
132             Document doc = null;
133             double score = -1;
134             try {
135                 doc = hits.doc(i);
136                 text = doc.get(FIELD_NAME);
137                 path = doc.get(PATH_NAME);
138                 user = doc.get(USER_NAME);
139                 score = hits.score(i);
140             } catch (IOException e) {
141                 LOG.debug(e);
142             }
143 
144             TokenStream tokenStream = analyzer.tokenStream(
145                     FIELD_NAME, new StringReader(text));
146 
147             // Execute highlighter.
148             String result = null;
149             try {
150                 result = highlighter.getBestFragments(
151                         tokenStream, text, maxNumFragmentsRequired, "...");
152 
153             } catch (IOException e) {
154                 LOG.debug(e);
155             }
156 
157             allresults.append(result);
158             SearchResult searchResult = new SearchResultImpl();
159             searchResult.addContext(result);
160             searchResult.setFile(path);
161             searchResult.setQuery(query.toString());
162             searchResult.setWeight(score);
163             searchResult.setUser(user);
164             list.add(searchResult);
165         }
166         try {
167             if (reader != null) {
168                 reader.close();
169             }
170             if (searcher != null) {
171                 searcher.close();
172             }
173         } catch (IOException e) {
174             LOG.debug(e);
175         }
176         LOG.info("Debug: Serach list: "+list);
177         LOG.info("Debug: Found: "+list.size()+" items");
178         return list;
179     }
180 
181     /***
182      *
183      * @param originalText String
184      * @param group TokenGroup
185      * @return text string
186      */
187     public String highlightTerm(String originalText, TokenGroup group) {
188         if (group.getTotalScore() <= 0) {
189             return originalText;
190         }
191 
192         return "" + originalText + "";
193     }
194 
195     /***
196      *
197      * @param queryString String
198      */
199     private void doSearching(final String queryString) {
200         try {
201             reader = IndexReader.open(indexPath);
202             String querieString = queryString.replaceAll("////","////");
203             querieString = querieString.replaceAll(":","////:");
204             querieString = "\""+querieString+"\"";
205             LOG.info("Debug: Query after adaption:" +querieString);
206             searcher = new IndexSearcher(indexPath);
207 
208             querieString = USER_NAME+":"+querieString+" "+querieString;
209 
210             query = QueryParser.parse(querieString, FIELD_NAME,
211                                       new StandardAnalyzer());
212             /*query = QueryParser.parse(querieString, FIELD_NAME,
213                                       new StandardAnalyzer());
214                                       */
215             //for any multi-term queries to work
216             // (prefix, wildcard, range,fuzzy etc) you must use a rewritten
217             // query!
218             LOG.info("Debug: Reader: " + reader);
219             LOG.info("Debug: Query parsed: "+query.toString());
220             query = query.rewrite(reader);
221 
222             LOG.info("Debug: Searching for: " +
223                     query.toString(FIELD_NAME) + " in " + indexPath);
224 
225             hits = searcher.search(query);
226         } catch (IOException e) {
227             LOG.debug(e);
228         } catch (ParseException e) {
229             LOG.debug(e);
230         }
231     }
232 
233    
234     /***
235      * @param startTag String
236      * @param endTag String
237      */
238     public void setHighlightTags(String startTag,String endTag) {
239         this.startTag = startTag;
240         this.endTag = endTag;
241     }
242 
243     /***
244      *
245      * @param path String
246      * @return ContextSearcher 
247      */
248     public static ContextSearcher createContextSearch(String path) {
249         if (contextSearch == null) {
250             contextSearch = new ContextSearcher(path);
251         }
252         return contextSearch;
253     }
254 }
255 /***
256  * $Log: ContextSearcher.java,v $
257  * Revision 1.7  2005/12/20 15:36:46  shally
258  * CheckStyle and PMD changes.
259  *
260  * Revision 1.6  2005/09/30 14:38:08  bavo_jcs
261  * Fixed URL
262  *
263  * Revision 1.5  2005/09/13 08:11:17  schauwvliege
264  * organize imports
265  *
266  * Revision 1.4  2005/09/06 13:25:29  schauwvliege
267  * fix
268  *
269  * Revision 1.3  2005/08/31 09:53:40  bavo_jcs
270  * Lucene delete fix
271  *
272  * Revision 1.2  2005/08/26 15:08:04  bavo_jcs
273  * Search fix
274  *
275  * Revision 1.1  2005/08/26 07:58:30  ge0ffrey
276  * split up the sources in service, serviceimpl and webclient
277  *
278  * Revision 1.7  2005/08/10 09:04:49  bavo_jcs
279  * Optimized imports according to checkstyle
280  *
281  * Revision 1.6  2005/08/09 12:59:55  bavo_jcs
282  * Optimized imports
283  *
284  * Revision 1.5  2005/07/12 10:42:39  bavo_jcs
285  * PDF Ajax integration, added user info
286  *
287  * Revision 1.4  2005/07/11 15:01:41  bavo_jcs
288  * PDF Ajax integration
289  *
290  * Revision 1.3  2005/06/14 12:05:53  schauwvliege
291  * CheckStyle and fixing tests
292  *
293  * Revision 1.2  2005/06/09 08:18:52  bejug_cc
294  * Fix initial import
295  *
296  * Revision 1.4  2005/06/07 14:38:48  bbr
297  * Lucene highlightterms added
298  *
299  * Revision 1.3  2005/06/07 09:46:13  bbr
300  * job source, webpath, indexing
301  *
302  * Revision 1.2  2005/05/23 17:33:53  sja
303  * Introduced File.separator.
304  *
305  * Revision 1.1  2005/05/23 17:09:48  sja
306  * Removed Lucene prefix.
307  *
308  * Revision 1.1  2005/05/23 15:42:00  bbr
309  * added weight to lucene
310  *
311  * Revision 1.1  2005/05/23 08:46:33  PSONG09
312  * added feeder source files to project
313  *
314  * Revision 1.3  2005/05/23 07:10:18  stephan_janssen
315  * Code cleanup.
316  *
317  * Revision 1.2  2005/05/20 07:45:29  bavo_jcs
318  * -lucene changes
319  *
320  * Revision 1.1  2005/05/18 15:46:39  bavo_jcs
321  * -adeed lucene service
322  *
323  */