View Javadoc

1   /***
2    Copyright (C) 2005 The Java Community
3   
4    This program is free software; you can redistribute it and/or modify  it under
5    the terms of the GNU General Public License as published by  the Free Software
6    Foundation; either version 2 of the License, or  (at your option) any later
7    version.
8   
9    This program is distributed in the hope that it will be useful,  but WITHOUT
10   ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS
11   FOR A PARTICULAR PURPOSE. See the  GNU General Public License for more details.
12  
13   You should have received a copy of the GNU General Public License  along with
14   this program; if not, write to the Free Software  Foundation, Inc., 59 Temple
15   Place, Suite 330, Boston, MA 02111-1307 USA.
16   */
17  package org.bejug.javacareers.jobs.search.lucene;
18  
19  import org.apache.commons.logging.Log;
20  import org.apache.commons.logging.LogFactory;
21  import org.apache.commons.io.IOUtils;
22  import org.apache.lucene.analysis.SimpleAnalyzer;
23  import org.apache.lucene.document.Document;
24  import org.apache.lucene.document.Field;
25  import org.apache.lucene.index.IndexReader;
26  import org.apache.lucene.index.IndexWriter;
27  import org.apache.lucene.index.Term;
28  import org.pdfbox.pdmodel.PDDocument;
29  import org.pdfbox.util.PDFTextStripper;
30  
31  import java.io.*;
32  import java.util.Date;
33  
34  /***
35   * @author Bavo Bruylandt (Last modified by $Author: shally $)
36   * @version $Revision: 1.8 $ - $Date: 2005/12/20 15:36:46 $
37   */
38  public class PdfIndexer {
39  
40      /***
41       * The class logger.
42       */
43      private static final Log LOG = LogFactory.getLog(PdfIndexer.class);
44  
45      /***
46       *
47       */
48      private static String indexPath;
49  
50      /***
51       * @param subpath String
52       * @throws PdfException if an error
53       */
54      private PdfIndexer(String subpath) throws PdfException {
55          LOG.info("Debug: Getting pdf indexer for " + subpath);
56          indexPath = new File(subpath + File.separator).getAbsolutePath();
57          LOG.info("Debug: Getting pdf indexer for " + indexPath);
58          createIndex(indexPath);
59      }
60  
61      /***
62       * 
63       * @return indexPath
64       */
65      public static String getIndexPath() {
66          return indexPath;
67      }
68  
69      /***
70       * @param path String
71       * @throws PdfException if an error
72       */
73      private void createIndex(String path) throws PdfException {
74          LOG.info("Debug: path = " + path);
75          //String indexPath = path;
76          IndexWriter writer = null;
77  
78          // An index is created by opening an IndexWriter with the
79          // create argument set to true.
80          try {
81              writer = new IndexWriter(indexPath, null, true);
82          }
83          catch (IOException e) {
84              LOG.error(e);
85              throw new PdfException(e);
86          }
87          finally {            
88              if (writer != null) {
89                  try {
90                      writer.close();
91                  }
92                  catch (IOException e) {
93                      LOG.error(e);
94                  }
95              }
96          }
97      }
98  
99      /***
100      * @param file String
101      * @throws PdfException if an error
102      */
103     public void writeToIndex(String file) throws PdfException {
104         IndexWriter writer = null;
105         InputStream is = null;
106 
107         try {
108             writer = new IndexWriter(indexPath, new SimpleAnalyzer(), false);
109             LOG.info("Debug: Indexing file " + file);
110             is = new FileInputStream(file);
111             Document doc = new Document();
112             doc.add(Field.Text("path", file));
113             doc.add(Field.Text("body", new InputStreamReader(is)));
114             writer.addDocument(doc);
115         }
116         catch (IOException e) {
117             LOG.debug(e);
118         }
119         finally {
120             IOUtils.closeQuietly(is);
121 
122             try {
123                 if (writer != null) {
124                     writer.close();
125                 }
126             }
127             catch (IOException e) {
128                 LOG.error(e);
129             }
130         }
131     }
132 
133     /***
134      * @param file String containing the fileName.
135      * @param user a String indicating the user who is indexing.
136      * @throws PdfException thrown when an error occcured.
137      */
138     public void writePdfToIndex(String file, String user) throws PdfException {
139 
140         IndexWriter writer = null;
141         InputStream is = null;
142         PDDocument pddoc = null;
143 
144         try {
145             writer = new IndexWriter(indexPath, new SimpleAnalyzer(), false);
146             File pdffile = new File(file);
147             LOG.info("Debug: Reading PDF file for index: " + pdffile.getAbsolutePath());
148             is = new FileInputStream(pdffile);
149 
150             pddoc = PDDocument.load(is);
151 
152             PDFTextStripper stripper = new PDFTextStripper();
153             String text = stripper.getText(pddoc);
154             Document doc = new Document();
155             doc.add(Field.Keyword("path", pdffile.getAbsolutePath()));
156             doc.add(Field.Text("date", new Date().toString()));
157             doc.add(Field.Text("body", text));
158             doc.add(Field.Text("user", user));
159             LOG.info("Debug: Writing stripped file to writer at: " + indexPath);
160             writer.addDocument(doc);
161         }
162         catch (IOException e) {
163             LOG.debug(e);
164             throw new PdfException(e);
165         }
166         finally {
167             IOUtils.closeQuietly(is);
168 
169             try {
170                 if (pddoc != null) {
171                     pddoc.close();
172                 }
173                 if (writer != null) {
174                     writer.close();
175                 }
176             }
177             catch (IOException e) {
178                 LOG.debug(e);
179             }
180 
181         }
182     }
183 
184     /***
185      * @param user String
186      * @throws PdfException if an error
187      */
188     public void deletePdfFromIndex(String user) throws PdfException {
189         IndexReader reader = null;
190         try {
191             Term term = new Term("user",user);
192             LOG.info("Trying to delete from index: "+term.text());
193             reader = IndexReader.open(indexPath);
194             int deleted = reader.delete(term);
195             LOG.info("Deleted sucessfully "+deleted+" files");
196 
197         }
198         catch (IOException e) {
199             LOG.debug(e);
200             throw new PdfException(e);
201         }
202         finally {
203             try {
204                 if (reader != null) {
205                     reader.close();
206                 }
207             }
208             catch (IOException e) {
209                 LOG.error(e);
210             }
211 
212         }
213 
214     }
215     /*
216     private void searchKeyword(String indexPath, String queryString) {
217 
218 
219         System.out.println("Searching for: " + queryString);
220 
221         Searcher searcher = null;
222         try {
223             searcher = new IndexSearcher(indexPath);
224             //Query query = QueryParser.parse(queryString, "summary", new SimpleAnalyzer());
225             Query query = QueryParser.parse(queryString, "body", new SimpleAnalyzer());
226 
227             System.out.println("Parsed query: " + query.toString());
228             Hits hits = searcher.search(query);
229 
230             for (int i = 0; i < hits.length(); i++) {
231                 Document doc = hits.doc(i);
232                 System.out.println(doc.get("path") + "; Score: " + hits.score(i));
233                 Enumeration fields = doc.fields();
234 
235 
236                 while (fields.hasMoreElements()) {
237                     Object o = (Field) fields.nextElement();
238                     System.out.println("field: "+o.toString());
239                 }
240 
241             }
242 
243         }
244         catch (IOException e) {
245             e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
246         }
247         catch (ParseException e) {
248             e.printStackTrace();  //To change body of catch statement use File | Settings | File Templates.
249         }
250 
251     }
252 
253      */
254 
255 //    /***
256 //     * @param data String
257 //     * @param file String
258 //     */
259 //    private void writeDataFile(String data, String file) {
260 //        FileWriter writer = null;
261 //        try {
262 //            writer = new FileWriter(new File(file));
263 //            writer.write(data);
264 //        }
265 //        catch (FileNotFoundException e) {
266 //            LOG.error(e);
267 //        }
268 //        catch (IOException e) {
269 //            LOG.error(e);
270 //        }
271 //        finally {
272 //            try {
273 //                if (writer != null) {
274 //                    writer.close();
275 //                }
276 //            }
277 //            catch (IOException e) {
278 //                LOG.error(e);
279 //            }
280 //        }
281 //    }
282 
283     /***
284      * @param subpath String
285      * @return PdfIndexer
286      * @throws PdfException if an error
287      */
288     public static PdfIndexer createPdfIndexer(String subpath)
289             throws PdfException {
290         return new PdfIndexer(subpath);
291     }
292 }
293 
294 /***
295  * $Log: PdfIndexer.java,v $
296  * Revision 1.8  2005/12/20 15:36:46  shally
297  * CheckStyle and PMD changes.
298  *
299  * Revision 1.7  2005/12/09 10:46:55  shally
300  * Opkuis voor checkstyle en PMD
301  *
302  * Revision 1.6  2005/10/11 09:03:02  stephan_janssen
303  * Introduced IOUtils.
304  *
305  * Revision 1.5  2005/09/30 14:38:08  bavo_jcs
306  * Fixed URL
307  *
308  * Revision 1.4  2005/09/06 13:25:29  schauwvliege
309  * fix
310  *
311  * Revision 1.3  2005/08/31 09:53:40  bavo_jcs
312  * Lucene delete fix
313  *
314  * Revision 1.2  2005/08/26 15:08:04  bavo_jcs
315  * Search fix
316  *
317  * Revision 1.1  2005/08/26 07:58:30  ge0ffrey
318  * split up the sources in service, serviceimpl and webclient
319  *
320  * Revision 1.9  2005/08/25 15:12:42  bavo_jcs
321  * Lucene file removal
322  *
323  * Revision 1.8  2005/08/17 09:12:00  schauwvliege
324  * Checkstyle
325  *
326  * Revision 1.7  2005/08/10 09:04:49  bavo_jcs
327  * Optimized imports according to checkstyle
328  *
329  * Revision 1.6  2005/08/09 12:59:56  bavo_jcs
330  * Optimized imports
331  *
332  * Revision 1.5  2005/08/08 09:38:23  bme_jcs
333  * resolved checkstyle errors
334  *
335  * Revision 1.4  2005/07/12 10:42:39  bavo_jcs
336  * PDF Ajax integration, added user info
337  *
338  * Revision 1.3  2005/06/14 12:05:53  schauwvliege
339  * CheckStyle and fixing tests
340  *
341  * Revision 1.2  2005/06/09 08:18:52  bejug_cc
342  * Fix initial import
343  *
344  * Revision 1.4  2005/05/31 08:47:28  bbr
345  * lucene fix
346  *
347  * Revision 1.3  2005/05/25 13:13:21  bbr
348  * lucene bean
349  *
350  * Revision 1.2  2005/05/23 17:33:31  sja
351  * Introduced File.separator.
352  *
353  * Revision 1.1  2005/05/23 17:09:48  sja
354  * Removed Lucene prefix.
355  *
356  * Revision 1.1  2005/05/23 15:42:00  bbr
357  * added weight to lucene
358  *
359  * Revision 1.1  2005/05/23 08:46:33  PSONG09
360  * added feeder source files to project
361  *
362  * Revision 1.3  2005/05/23 07:10:18  stephan_janssen
363  * Code cleanup.
364  *
365  * Revision 1.2  2005/05/20 07:45:38  bavo_jcs
366  * -lucene changes
367  *
368  * Revision 1.1  2005/05/18 15:46:39  bavo_jcs
369  * -adeed lucene service
370  *
371  */