1 /***
2 Copyright (C) 2005 The Java Community
3
4 This program is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free Software
6 Foundation; either version 2 of the License, or (at your option) any later
7 version.
8
9 This program is distributed in the hope that it will be useful, but WITHOUT
10 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License along with
14 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 Place, Suite 330, Boston, MA 02111-1307 USA.
16 */
17 package org.bejug.javacareers.jobs.search.lucene;
18
19 import org.apache.commons.logging.Log;
20 import org.apache.commons.logging.LogFactory;
21 import org.apache.commons.io.IOUtils;
22 import org.apache.lucene.analysis.SimpleAnalyzer;
23 import org.apache.lucene.document.Document;
24 import org.apache.lucene.document.Field;
25 import org.apache.lucene.index.IndexReader;
26 import org.apache.lucene.index.IndexWriter;
27 import org.apache.lucene.index.Term;
28 import org.pdfbox.pdmodel.PDDocument;
29 import org.pdfbox.util.PDFTextStripper;
30
31 import java.io.*;
32 import java.util.Date;
33
34 /***
35 * @author Bavo Bruylandt (Last modified by $Author: shally $)
36 * @version $Revision: 1.8 $ - $Date: 2005/12/20 15:36:46 $
37 */
38 public class PdfIndexer {
39
40 /***
41 * The class logger.
42 */
43 private static final Log LOG = LogFactory.getLog(PdfIndexer.class);
44
45 /***
46 *
47 */
48 private static String indexPath;
49
50 /***
51 * @param subpath String
52 * @throws PdfException if an error
53 */
54 private PdfIndexer(String subpath) throws PdfException {
55 LOG.info("Debug: Getting pdf indexer for " + subpath);
56 indexPath = new File(subpath + File.separator).getAbsolutePath();
57 LOG.info("Debug: Getting pdf indexer for " + indexPath);
58 createIndex(indexPath);
59 }
60
61 /***
62 *
63 * @return indexPath
64 */
65 public static String getIndexPath() {
66 return indexPath;
67 }
68
69 /***
70 * @param path String
71 * @throws PdfException if an error
72 */
73 private void createIndex(String path) throws PdfException {
74 LOG.info("Debug: path = " + path);
75
76 IndexWriter writer = null;
77
78
79
80 try {
81 writer = new IndexWriter(indexPath, null, true);
82 }
83 catch (IOException e) {
84 LOG.error(e);
85 throw new PdfException(e);
86 }
87 finally {
88 if (writer != null) {
89 try {
90 writer.close();
91 }
92 catch (IOException e) {
93 LOG.error(e);
94 }
95 }
96 }
97 }
98
99 /***
100 * @param file String
101 * @throws PdfException if an error
102 */
103 public void writeToIndex(String file) throws PdfException {
104 IndexWriter writer = null;
105 InputStream is = null;
106
107 try {
108 writer = new IndexWriter(indexPath, new SimpleAnalyzer(), false);
109 LOG.info("Debug: Indexing file " + file);
110 is = new FileInputStream(file);
111 Document doc = new Document();
112 doc.add(Field.Text("path", file));
113 doc.add(Field.Text("body", new InputStreamReader(is)));
114 writer.addDocument(doc);
115 }
116 catch (IOException e) {
117 LOG.debug(e);
118 }
119 finally {
120 IOUtils.closeQuietly(is);
121
122 try {
123 if (writer != null) {
124 writer.close();
125 }
126 }
127 catch (IOException e) {
128 LOG.error(e);
129 }
130 }
131 }
132
133 /***
134 * @param file String containing the fileName.
135 * @param user a String indicating the user who is indexing.
136 * @throws PdfException thrown when an error occcured.
137 */
138 public void writePdfToIndex(String file, String user) throws PdfException {
139
140 IndexWriter writer = null;
141 InputStream is = null;
142 PDDocument pddoc = null;
143
144 try {
145 writer = new IndexWriter(indexPath, new SimpleAnalyzer(), false);
146 File pdffile = new File(file);
147 LOG.info("Debug: Reading PDF file for index: " + pdffile.getAbsolutePath());
148 is = new FileInputStream(pdffile);
149
150 pddoc = PDDocument.load(is);
151
152 PDFTextStripper stripper = new PDFTextStripper();
153 String text = stripper.getText(pddoc);
154 Document doc = new Document();
155 doc.add(Field.Keyword("path", pdffile.getAbsolutePath()));
156 doc.add(Field.Text("date", new Date().toString()));
157 doc.add(Field.Text("body", text));
158 doc.add(Field.Text("user", user));
159 LOG.info("Debug: Writing stripped file to writer at: " + indexPath);
160 writer.addDocument(doc);
161 }
162 catch (IOException e) {
163 LOG.debug(e);
164 throw new PdfException(e);
165 }
166 finally {
167 IOUtils.closeQuietly(is);
168
169 try {
170 if (pddoc != null) {
171 pddoc.close();
172 }
173 if (writer != null) {
174 writer.close();
175 }
176 }
177 catch (IOException e) {
178 LOG.debug(e);
179 }
180
181 }
182 }
183
184 /***
185 * @param user String
186 * @throws PdfException if an error
187 */
188 public void deletePdfFromIndex(String user) throws PdfException {
189 IndexReader reader = null;
190 try {
191 Term term = new Term("user",user);
192 LOG.info("Trying to delete from index: "+term.text());
193 reader = IndexReader.open(indexPath);
194 int deleted = reader.delete(term);
195 LOG.info("Deleted sucessfully "+deleted+" files");
196
197 }
198 catch (IOException e) {
199 LOG.debug(e);
200 throw new PdfException(e);
201 }
202 finally {
203 try {
204 if (reader != null) {
205 reader.close();
206 }
207 }
208 catch (IOException e) {
209 LOG.error(e);
210 }
211
212 }
213
214 }
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255 /***
256 // * @param data String
257 // * @param file String
258 // */
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283 /***
284 * @param subpath String
285 * @return PdfIndexer
286 * @throws PdfException if an error
287 */
288 public static PdfIndexer createPdfIndexer(String subpath)
289 throws PdfException {
290 return new PdfIndexer(subpath);
291 }
292 }
293
294 /***
295 * $Log: PdfIndexer.java,v $
296 * Revision 1.8 2005/12/20 15:36:46 shally
297 * CheckStyle and PMD changes.
298 *
299 * Revision 1.7 2005/12/09 10:46:55 shally
300 * Opkuis voor checkstyle en PMD
301 *
302 * Revision 1.6 2005/10/11 09:03:02 stephan_janssen
303 * Introduced IOUtils.
304 *
305 * Revision 1.5 2005/09/30 14:38:08 bavo_jcs
306 * Fixed URL
307 *
308 * Revision 1.4 2005/09/06 13:25:29 schauwvliege
309 * fix
310 *
311 * Revision 1.3 2005/08/31 09:53:40 bavo_jcs
312 * Lucene delete fix
313 *
314 * Revision 1.2 2005/08/26 15:08:04 bavo_jcs
315 * Search fix
316 *
317 * Revision 1.1 2005/08/26 07:58:30 ge0ffrey
318 * split up the sources in service, serviceimpl and webclient
319 *
320 * Revision 1.9 2005/08/25 15:12:42 bavo_jcs
321 * Lucene file removal
322 *
323 * Revision 1.8 2005/08/17 09:12:00 schauwvliege
324 * Checkstyle
325 *
326 * Revision 1.7 2005/08/10 09:04:49 bavo_jcs
327 * Optimized imports according to checkstyle
328 *
329 * Revision 1.6 2005/08/09 12:59:56 bavo_jcs
330 * Optimized imports
331 *
332 * Revision 1.5 2005/08/08 09:38:23 bme_jcs
333 * resolved checkstyle errors
334 *
335 * Revision 1.4 2005/07/12 10:42:39 bavo_jcs
336 * PDF Ajax integration, added user info
337 *
338 * Revision 1.3 2005/06/14 12:05:53 schauwvliege
339 * CheckStyle and fixing tests
340 *
341 * Revision 1.2 2005/06/09 08:18:52 bejug_cc
342 * Fix initial import
343 *
344 * Revision 1.4 2005/05/31 08:47:28 bbr
345 * lucene fix
346 *
347 * Revision 1.3 2005/05/25 13:13:21 bbr
348 * lucene bean
349 *
350 * Revision 1.2 2005/05/23 17:33:31 sja
351 * Introduced File.separator.
352 *
353 * Revision 1.1 2005/05/23 17:09:48 sja
354 * Removed Lucene prefix.
355 *
356 * Revision 1.1 2005/05/23 15:42:00 bbr
357 * added weight to lucene
358 *
359 * Revision 1.1 2005/05/23 08:46:33 PSONG09
360 * added feeder source files to project
361 *
362 * Revision 1.3 2005/05/23 07:10:18 stephan_janssen
363 * Code cleanup.
364 *
365 * Revision 1.2 2005/05/20 07:45:38 bavo_jcs
366 * -lucene changes
367 *
368 * Revision 1.1 2005/05/18 15:46:39 bavo_jcs
369 * -adeed lucene service
370 *
371 */